from google.colab import drive
drive.mount('/content/drive')

cd 'drive/My Drive/vitalSigns/vs_acs'
ls

Import Modules & Construct Path Handlers

import os
import sys
import pandas as pd

pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)

# Find the relative path to a file by walking the directory tree under root
def findFile(root, file):
    for d, subD, f in os.walk(root):
        if file in f:
            return "{1}/{0}".format(file, d)
    return None

# To 'import' a script you wrote, add its directory to sys.path
def addPath(root, file):
    sys.path.append(os.path.dirname(os.path.abspath(findFile(root, file))))
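A minimal sketch of how these helpers behave; the script name below is hypothetical and only illustrates the call pattern:

# Hypothetical file name, purely for illustration
print(findFile('./', 'my_script.py'))  # prints something like './scripts/my_script.py' if found, else None
addPath('./', 'my_script.py')          # appends that script's directory to sys.path so `import my_script` would work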

Get Vital Signs Reference Table

file = 'VitalSignsCensus_ACS_Tables.xlsx'
xls = pd.ExcelFile(findFile('../', file))
acs_tables = pd.read_excel(xls, 'acs_tables')
acs_tables.head()
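The execution loop further down relies on a few specific columns of this reference sheet ('id', 'shortname', and a per-year flag such as '19_exists'), so a quick look at what was loaded can catch a renamed column early. A minimal check:

# Sanity check: the loop below expects 'id', 'shortname', and a '<year>_exists' flag column
print(acs_tables.columns.tolist())
print(acs_tables[['id', 'shortname']].head())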

Get Tract / CSA Crosswalk

file = 'csa2tractcrosswalk.csv'
crosswalk = pd.read_csv(findFile('../', file))
crosswalk = dict(zip(crosswalk['TRACTCE10'], crosswalk['CSA2010']))
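A minimal sketch of how this crosswalk is used later: a tract number (TRACTCE10) is looked up to get its Community Statistical Area (CSA2010), with "empty" as the fallback. The tract number below is only an illustration and may not exist in the file:

# Hypothetical lookup; the real lookups happen per-row in the save step below
print(crosswalk.get(10100, "empty"))  # -> a CSA name, or "empty" if the tract is not in the crosswalk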

Get retrieve_acs_data function

file = 'retrieve_acs_data.py'
addPath('../../', file)
from retrieve_acs_data import retrieve_acs_data

# File: retrieveAcsData.py
# Author: Charles Karpati
# Date: 1/9/19
# Section: Bnia
# Email: karpati1@umbc.edu
#
# Description: This file returns ACS data given a table ID.
#
# def main():
#   purpose: Retrieves ACS data from the web
#   input:   ID
#   output:  ACS data. Prints to ../../data/2_cleaned/acs/

import pandas as pd
import csv
from urllib.parse import urlencode

# This prevents timeouts
import socket
socket.setdefaulttimeout(10.0)


def retrieve_acs_data(year, tableId):
    keys = []
    vals = []
    header = []
    getTheseKeys = ''
    getTheseKeys2 = ''
    getTheseKeys3 = ''
    getTheseKeys4 = ''
    keyCount = 0

    # ~~~~~~~~~~~~~~~
    # Step 1)
    # Retrieve a metadata table describing the contents of the ACS table
    # ~~~~~~~~~~~~~~~
    url = 'https://api.census.gov/data/20'+year+'/acs/acs5/groups/'+tableId+'.json'
    print(url)
    metaDataTable = pd.read_json(url, orient='records')

    # ~~~~~~~~~~~~~~~
    # Step 2)
    # Create a dictionary using the metadata table
    # ~~~~~~~~~~~~~~~
    # Multiple queries may be required: the maximum number of columns returned
    # by any single query is 50, so bin the columns into groups of roughly 50.
    for key in metaDataTable['variables'].keys():
        if key[-1:] == 'E':
            keyCount = keyCount + 1
            if keyCount < 50:
                getTheseKeys = getTheseKeys+','+key
            elif keyCount < 99:
                getTheseKeys2 = getTheseKeys2+','+key
            elif keyCount < 148:
                getTheseKeys3 = getTheseKeys3+','+key
            else:
                getTheseKeys4 = getTheseKeys4+','+key
            keys.append(key)
            val = metaDataTable['variables'][key]['label']
            val = key+'_'+val.replace('Estimate!!', '').replace('!!', '_').replace(' ', '_')
            vals.append(val)
    dictionary = dict(zip(keys, vals))

    # ~~~~~~~~~~~~~~~
    # Step 3)
    # Get the actual data with all the columns obtained from the metadata table
    # ~~~~~~~~~~~~~~~
    # Example query:
    # https://api.census.gov/data/2016/acs/acs5?get=NAME,B11001_002E&for=county:005&in=state:24
    urlRoot = 'https://api.census.gov/data/20'+year+'/acs/acs5?'

    def getParams(keys):
        # All tracts in Baltimore City (state 24, county 510)
        return {
            'get': 'NAME'+keys,
            'for': 'tract:*',
            'in': 'state:24 county:510',
            'key': '829bf6f2e037372acbba32ba5731647c5127fdb0'
        }

    def getBCityParams(keys):
        # Baltimore City as a whole (county 510)
        return {
            'get': 'NAME'+keys,
            'for': 'county:510',
            'in': 'state:24',
            'key': '829bf6f2e037372acbba32ba5731647c5127fdb0'
        }

    def readIn(url):
        tbl = pd.read_json(url, orient='records')
        tbl.columns = tbl.iloc[0]
        return tbl

    def appendColumns(table, params):
        # Get tract and city records for a specific batch of columns
        table2 = readIn(urlRoot+urlencode(getParams(params)))
        table3 = readIn(urlRoot+urlencode(getBCityParams(params)))
        table3['tract'] = '010000'
        # Concatenate the records
        table2 = pd.concat([table2, table3], ignore_index=True)
        # Merge into the master table
        table = pd.merge(table, table2, how='left',
                         left_on=["NAME", "state", "county", "tract"],
                         right_on=["NAME", "state", "county", "tract"])
        return table

    # Get tract data
    url = urlRoot+urlencode(getParams(getTheseKeys))
    table = readIn(url)
    table = table.iloc[1:]

    # Get Baltimore City's data
    url = urlRoot+urlencode(getBCityParams(getTheseKeys))
    table2 = readIn(url)
    table2 = table2[1:]
    table2['tract'] = '010000'

    # Append Baltimore City to the tracts
    table = pd.concat([table, table2], ignore_index=True)

    # Pull any remaining batches of columns and merge them in
    if getTheseKeys2 != '':
        table = appendColumns(table, getTheseKeys2)
    if getTheseKeys3 != '':
        table = appendColumns(table, getTheseKeys3)
    if getTheseKeys4 != '':
        table = appendColumns(table, getTheseKeys4)

    # ~~~~~~~~~~~~~~~
    # Step 4)
    # Prepare column names using the metadata table. The raw data has column
    # names in its first row as well. Replace column IDs with labels from the
    # dictionary where applicable (should be always).
    # ~~~~~~~~~~~~~~~
    for column in table.columns:
        if column in keys:
            header.append(dictionary[column])
        else:
            header.append(column)
    header = [sub.replace(':', '') for sub in header]
    print('HEADERS: ', header)
    table.columns = header

    # ~~~~~~~~~~~~~~~
    # Step 5) Everything else
    # ~~~~~~~~~~~~~~~
    # Prettify names
    table['NAME'] = table['NAME'].str.replace(', Baltimore city, Maryland', '')
    table['NAME'][table['NAME'] == 'Baltimore city, Maryland'] = 'Baltimore City'
    # Convert columns from strings to numbers where applicable
    table = table.apply(pd.to_numeric, errors='ignore')
    return table
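A minimal sketch of a single call, useful before running the full loop further down. The table ID B01001 (Sex by Age) is used only as an example here and is an assumption; it may or may not appear in the Vital Signs reference list:

# Example one-off call: 5-year ACS estimates for 2019, one row per Baltimore City
# tract plus a city-wide row (tract '010000'). Requires internet access.
df_example = retrieve_acs_data('19', 'B01001')
df_example.head()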

Column Operations

import csv

# csv is needed for csv.QUOTE_ALL when saving.
# fixColNamesForCSV keeps the key columns as-is and strips the leading
# 12-character ACS variable ID (e.g. 'B01001_001E_') from every other column name.
def fixColNamesForCSV(x):
    return str(x)[:] if str(x) in ["NAME", "state", "county", "tract", "CSA"] else str(x)[12:]
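For illustration, this is the effect of fixColNamesForCSV on the two kinds of column names it sees: key columns pass through unchanged, while ACS variable columns lose their ID prefix. The column names below are illustrative:

print(fixColNamesForCSV('B01001_001E_Total'))  # -> 'Total' (prefix 'B01001_001E_' stripped)
print(fixColNamesForCSV('tract'))              # -> 'tract' (key columns are kept as-is)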

ACS TOOL STEP 2 -> Execute:

Save the ACS Data

# For each table: set the index, save the raw output, map tracts to CSAs, and save a cleaned copy.
#   Set index:    df.set_index("NAME", inplace=True)
#   Save raw to:  '../../data/3_outputs/acs/raw/'+year+'/'+tableId+'_'+description+'_5y'+year+'_est.csv'
#   Tract to CSA: df['CSA'] = df.apply(lambda row: crosswalk.get(int(row['tract']), "empty"), axis=1)
#   Save for use: '../../data/2_cleaned/acs/'+tableId+'_'+description+'_5y'+year+'_est.csv'

year = '19'
count = 0
startFrom = 0

# For each ACS table
for x, row in acs_tables.iterrows():
    count += 1

    # Grab its metadata
    description = str(acs_tables.loc[x, 'shortname'])
    tableId = str(acs_tables.loc[x, 'id'])
    yearExists = int(acs_tables.loc[x, year+'_exists'])

    # If the indicator is valid for the year;
    # use startFrom to begin at a specific count
    if yearExists and count >= startFrom:
        print(str(count)+') '+tableId+' '+description)

        # Retrieve the ACS indicator
        print('sending retrieve_acs_data', year, tableId)
        df = retrieve_acs_data(year, tableId)
        df.set_index("NAME", inplace=True)

        # Save the data as raw.
        # We do not want the variable ID in the column names.
        saveThis = df.rename(columns=lambda x: fixColNamesForCSV(x))
        saveThis.to_csv('./AcsDataRaw/'+tableId+'_'+description+'_5y'+year+'_est.csv', quoting=csv.QUOTE_ALL)

        # Match tract to CSA
        df['CSA'] = df.apply(lambda row: crosswalk.get(int(row['tract']), "empty"), axis=1)

        # Save the data (again) as cleaned, for use in the next scripts
        df.to_csv('./AcsDataClean/'+tableId+'_5y'+year+'_est.csv', quoting=csv.QUOTE_ALL)
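Once the loop finishes, the output files can be spot-checked on disk; the folder names and file-name patterns below come straight from the to_csv calls above:

import os
print(sorted(os.listdir('./AcsDataRaw'))[:5])    # <tableId>_<description>_5y19_est.csv
print(sorted(os.listdir('./AcsDataClean'))[:5])  # <tableId>_5y19_est.csv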
