from google.colab import drive
drive.mount('/content/drive')

cd 'drive/My Drive/vitalSigns/vs_acs'
ls
Import Modules & Construct Path Handlers
import os
import sys
import pandas as pd
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)

# Find Relative Path to Files
def findFile(root, file):
    for d, subD, f in os.walk(root):
        if file in f:
            return "{1}/{0}".format(file, d)
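# e.g., findFile('../', 'csa2tractcrosswalk.csv') returns something like
# '../<some subfolder>/csa2tractcrosswalk.csv' (the exact path depends on your
# folder layout), or None if the file is not found anywhere under root.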
# To 'import' a script you wrote, add its parent directory to sys.path
def addPath(root, file):
    sys.path.append(os.path.dirname(os.path.abspath(findFile(root, file))))

Get Vital Signs Reference Table
file = 'VitalSignsCensus_ACS_Tables.xlsx'
xls = pd.ExcelFile(findFile('../', file))
acs_tables = pd.read_excel(xls, 'acs_tables')
acs_tables.head()

Get Tract/CSA Crosswalk
file = 'csa2tractcrosswalk.csv'
crosswalk = pd.read_csv( findFile( '../', file) )
crosswalk = dict(zip(crosswalk['TRACTCE10'], crosswalk['CSA2010']))
# Maps a tract code (int) to its CSA name, e.g. crosswalk.get(int(row['tract']), "empty")

Get retrieve_acs_data function
file = 'retrieve_acs_data.py'
addPath( '../../', file)
from retrieve_acs_data import retrieve_acs_data

#File: retrieve_acs_data.py
#Author: Charles Karpati
#Date: 1/9/19
#Section: Bnia
#Email: karpati1@umbc.edu
#Description:
#This file returns ACS data given a table ID
#def main():
#purpose: Retrieves ACS data from the web
#input: year, table ID
#output: ACS data. Prints to ../../data/2_cleaned/acs/
import pandas as pd
import csv
from urllib.parse import urlencode
# Bound how long any single request can hang (10-second socket timeout)
import socket
socket.setdefaulttimeout(10.0)
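# (setdefaulttimeout applies to every new socket in this process, so the
# pd.read_json(url) calls below inherit the 10-second limit.)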
def retrieve_acs_data(year, tableId):
    keys = []
    vals = []
    header = []
    getTheseKeys = ''
    getTheseKeys2 = ''
    getTheseKeys3 = ''
    getTheseKeys4 = ''
    keyCount = 0
    #~~~~~~~~~~~~~~~
    # Step 1)
    # Retrieve a Meta Data Table Describing the Content of the Table
    #~~~~~~~~~~~~~~~
    url = 'https://api.census.gov/data/20'+year+'/acs/acs5/groups/'+tableId+'.json'
    print(url)
    metaDataTable = pd.read_json(url, orient='records')
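    # The groups endpoint returns JSON shaped roughly like
    #   {"variables": {"B01001_001E": {"label": "Estimate!!Total", ...}, ...}}
    # so metaDataTable['variables'] holds one entry per variable ID.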
    #~~~~~~~~~~~~~~~
    # Step 2)
    # Create a Dictionary using the Meta Data Table
    #~~~~~~~~~~~~~~~
    # Multiple Queries may be Required.
    # Max columns returned from any given query is 50.
    # For that reason, bin the Columns into Groups of 50.
    for key in metaDataTable['variables'].keys():
        if key[-1:] == 'E':
            keyCount = keyCount + 1
            if keyCount < 50: getTheseKeys = getTheseKeys+','+key
            elif keyCount < 99: getTheseKeys2 = getTheseKeys2+','+key
            elif keyCount < 148: getTheseKeys3 = getTheseKeys3+','+key
            else: getTheseKeys4 = getTheseKeys4+','+key
            keys.append(key)
            val = metaDataTable['variables'][key]['label']
            val = key+'_'+val.replace('Estimate!!', '').replace('!!', '_').replace(' ', '_')
            vals.append(val)
    dictionary = dict(zip(keys, vals))
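    # dictionary now maps variable IDs to readable names, e.g. (illustrative)
    #   'B01001_002E' -> 'B01001_002E_Total_Male'
    # derived from a label like 'Estimate!!Total!!Male'.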
    #~~~~~~~~~~~~~~~
    # Step 3)
    # Get the actual data we want with all the columns (obtained using the meta data table)
    #~~~~~~~~~~~~~~~
    # https://api.census.gov/data/2016/acs/acs5?get=NAME,B11001_002E&for=county:005&in=state:24
    urlRoot = 'https://api.census.gov/data/20'+year+'/acs/acs5?'
    def getParams(keys):
        return {
            'get': 'NAME'+keys,
            'for': 'tract:*',
            'in': 'state:24 county:510',
            'key': '829bf6f2e037372acbba32ba5731647c5127fdb0'
        }
    def getBCityParams(keys):
        return {
            'get': 'NAME'+keys,
            'for': 'county:510',
            'in': 'state:24',
            'key': '829bf6f2e037372acbba32ba5731647c5127fdb0'
        }
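    # getParams queries every census tract in Baltimore City (state 24,
    # county 510); getBCityParams queries the single citywide record. Each
    # `keys` argument is one comma-prefixed batch built in Step 2.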
    def readIn(url):
        # The API returns a JSON array whose first row is the header,
        # so promote row 0 to column names.
        tbl = pd.read_json(url, orient='records')
        tbl.columns = tbl.iloc[0]
        return tbl
    def appendColumns(table, params):
        # Get Tract and City Records For Specific Columns
        table2 = readIn(urlRoot+urlencode(getParams(params)))
        table3 = readIn(urlRoot+urlencode(getBCityParams(params)))
        table3['tract'] = '010000'
        # Concatenate the Records
        table2 = pd.concat([table2, table3], ignore_index=True)
        # Merge to Master Table
        table = pd.merge(table, table2, how='left', on=["NAME","state","county","tract"])
        return table
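    # appendColumns is called once per extra key batch below; each call widens
    # the master table by up to ~49 columns, joined on the geography columns.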
    # Get Tract Data
    url = urlRoot+urlencode(getParams(getTheseKeys))
    table = readIn(url)
    table = table.iloc[1:]
    # Get Baltimore City's Data
    url = urlRoot+urlencode(getBCityParams(getTheseKeys))
    table2 = readIn(url)
    table2 = table2[1:]
    table2['tract'] = '010000'
    # Append Baltimore to Tracts
    table = pd.concat([table, table2], ignore_index=True)
    if getTheseKeys2 != '':
        table = appendColumns(table, getTheseKeys2)
    if getTheseKeys3 != '':
        table = appendColumns(table, getTheseKeys3)
    if getTheseKeys4 != '':
        table = appendColumns(table, getTheseKeys4)
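    # Illustrative alternative (a sketch, not used by this script): the four
    # fixed getTheseKeys strings could be replaced by a generic chunker that
    # splits the estimate keys into API-sized batches of at most 49 IDs,
    # leaving room for NAME within the 50-column limit:
    def chunkKeys(allKeys, size=49):
        # Yield comma-prefixed batches of at most `size` variable IDs.
        for i in range(0, len(allKeys), size):
            yield ',' + ','.join(allKeys[i:i + size])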
    #~~~~~~~~~~~~~~~
    # Step 4)
    # Prepare Column Names using the meta data table. The raw data has column names in the first row, as well.
    # Replace column IDs with labels from the dictionary where applicable (should be always)
    #~~~~~~~~~~~~~~~
    for column in table.columns:
        if column in keys: header.append(dictionary[column])
        else: header.append(column)
    header = [sub.replace(':', '') for sub in header]
    print('HEADERS: ', header)
    table.columns = header
    #~~~~~~~~~~~~~~~
    # Step 5) Everything Else
    #~~~~~~~~~~~~~~~
    # Prettify Names
    table['NAME'] = table['NAME'].str.replace(', Baltimore city, Maryland', '')
    table.loc[table['NAME'] == 'Baltimore city, Maryland', 'NAME'] = 'Baltimore City'
    # Convert Columns from Strings to Numbers where Applicable
    table = table.apply(pd.to_numeric, errors='ignore')
    return table

Column Operations
import csv  # 'quote all'
# Keep geography/CSA column names as-is; strip the 12-character variable-ID
# prefix (e.g. 'B11001_002E_') from everything else.
def fixColNamesForCSV(x):
    return str(x) if str(x) in ["NAME", "state", "county", "tract", "CSA"] else str(x)[12:]

ACS TOOL STEP 2 -> Execute:
Save the ACS Data
# Set Index:    df.set_index("NAME", inplace=True)
# Save raw to:  '../../data/3_outputs/acs/raw/'+year+'/'+tableId+'_'+description+'_5y'+year+'_est.csv'
# Tract to CSA: df['CSA'] = df.apply(lambda row: crosswalk.get(int(row['tract']), "empty"), axis=1)
# Save for use: '../../data/2_cleaned/acs/'+tableId+'_'+description+'_5y'+year+'_est.csv'
year = '19'
count = 0
startFrom = 0
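# Assumption: the two output folders may not exist yet; create them so the
# to_csv calls below don't fail on a missing directory.
os.makedirs('./AcsDataRaw', exist_ok=True)
os.makedirs('./AcsDataClean', exist_ok=True)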
# For each ACS Table
for x, row in acs_tables.iterrows():
    count += 1
    # Grab its Meta Data
    description = str(acs_tables.loc[x, 'shortname'])
    tableId = str(acs_tables.loc[x, 'id'])
    yearExists = int(acs_tables.loc[x, year+'_exists'])
    # If the Indicator is valid for the year;
    # use startFrom to begin at a specific count
    if yearExists and count >= startFrom:
        print(str(count)+') '+tableId+' '+description)
        # Retrieve the ACS indicator
        print('sending retrieve_acs_data', year, tableId)
        df = retrieve_acs_data(year, tableId)
        df.set_index("NAME", inplace=True)
        # Save the Data as Raw.
        # We do not want the variable IDs in the column names.
        saveThis = df.rename(columns=lambda x: fixColNamesForCSV(x))
        saveThis.to_csv('./AcsDataRaw/'+tableId+'_'+description+'_5y'+year+'_est.csv', quoting=csv.QUOTE_ALL)
        # Match Tract to CSA
        df['CSA'] = df.apply(lambda row: crosswalk.get(int(row['tract']), "empty"), axis=1)
        # Save the data (again) as Cleaned for use in the next scripts
        df.to_csv('./AcsDataClean/'+tableId+'_5y'+year+'_est.csv', quoting=csv.QUOTE_ALL)
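# Note: retrieve_acs_data prints each query URL and this loop prints each count,
# so a failed table is easy to spot; set startFrom to that count to resume.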