Awesome!
By this point you should be able to download a dataset and crosswalk new columns onto it by matching on 'tract'.
What we are going to do now is perform calculations using these newly created datasets.
Run the next few cells to create our calculation functions.
#@title Run This Cell: Misc Function Declarations
# Shared helpers used by every indicator calculation below.

import pandas as pd
import numpy  # required by the age/race/household indicators (numpy.append); was missing
import glob


def getColName(df, col):
    """Return the name of the first column of *df* whose name contains the
    (regex) pattern *col*.  Raises IndexError when nothing matches."""
    return df.columns[df.columns.str.contains(pat=col)][0]


def getColByName(df, col):
    """Return the first column of *df* whose name matches *col* (see getColName)."""
    return df[getColName(df, col)]


def addKey(df, fi, col):
    """Pull the column of *df* matching *col* into *fi* (under its full name)
    and return *fi*.  This is a plain column copy, not a crosswalk."""
    key = getColName(df, col)
    fi[key] = getColByName(df, col)
    return fi


def nullIfEqual(df, c1, c2):
    """Row-wise sum of the columns matching *c1* and *c2*.

    NOTE(review): the original comment claimed 'return 0 if the two columns
    are equal', but its ternary (`x if x != 0 else 0`) was a no-op, so this
    has always simply returned c1 + c2 per row.  Behavior preserved;
    flagged for follow-up.
    """
    return df.apply(lambda x: x[getColName(df, c1)] + x[getColName(df, c2)],
                    axis=1)


def sumInts(df):
    """Column-wise sum over the numeric columns of *df*."""
    return df.sum(numeric_only=True)


#@title Run This Cell : Create MHHI
#File: mhhi.py
#Author: Charles Karpati
#Date: 1/24/19
#Section: Bnia
#Email: karpati1@umbc.edu
#Description:
#  Uses ACS Table B19001 - HOUSEHOLD INCOME IN THE PAST 12 MONTHS
#  (IN 2016 INFLATION-ADJUSTED DOLLARS).  Universe: Households.
#  Estimates the median household income per tract by linear interpolation
#  inside the income bracket that contains the median household.
def mhhi(df, columnsToInclude=None):
    """Median household income indicator.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain columns matching B19001_002E..B19001_017E and 'tract'.
    columnsToInclude : list of str, optional
        Extra df columns to carry into the result.  (The original used a
        mutable default list and appended 'tract' to it in place, so
        repeated calls corrupted both the default and the caller's list —
        fixed here; passing a list still works as before.)

    Returns
    -------
    pd.DataFrame
        The interpolation work columns plus the estimate in 'final'.
    """
    extra = [] if columnsToInclude is None else list(columnsToInclude)

    # Income brackets: (ACS variable, lower bound, bracket width).
    # The last bracket is open-ended; a huge width stands in for infinity.
    info = pd.DataFrame([
        ['B19001_002E', 0, 10000],
        ['B19001_003E', 10000, 4999],
        ['B19001_004E', 15000, 4999],
        ['B19001_005E', 20000, 4999],
        ['B19001_006E', 25000, 4999],
        ['B19001_007E', 30000, 4999],
        ['B19001_008E', 35000, 4999],
        ['B19001_009E', 40000, 4999],
        ['B19001_010E', 45000, 4999],
        ['B19001_011E', 50000, 9999],
        ['B19001_012E', 60000, 14999],
        ['B19001_013E', 75000, 24999],
        ['B19001_014E', 100000, 24999],
        ['B19001_015E', 125000, 24999],
        ['B19001_016E', 150000, 49000],
        ['B19001_017E', 200000, 1000000000000000000000000],
    ], columns=['variable', 'lower', 'range'])

    # Pull each bracket's household count out of df.
    data_table = pd.DataFrame()
    for _, bracket in info.iterrows():
        data_table = addKey(df, data_table, bracket['variable'])

    # Cumulative household counts across the brackets; the median household
    # sits at half of the grand total (the last cumulative column).
    temp_table = data_table.cumsum(axis=1)
    temp_table['midpoint'] = temp_table.iloc[:, -1] / 2              # V3
    temp_table['midpoint_index'] = 0             # bracket holding the median
    temp_table['midpoint_index_value'] = 0       # households in that bracket (Z3)
    temp_table['midpoint_index_lower'] = 0       # bracket lower bound (W3)
    temp_table['midpoint_index_range'] = 0       # bracket width (X3)
    temp_table['midpoint_index_minus_one_cumulative_sum'] = 0  # cum. count below it (Y3)

    # Locate, per tract, the bracket containing the median household:
    # the last bracket whose cumulative count is still <= midpoint, plus one.
    # (.iloc replaces the original's positional ints on a labeled Series,
    # which is deprecated in modern pandas.)
    for index, row in temp_table.iterrows():
        midpoint = row['midpoint']
        if midpoint < int(row.iloc[0]) or midpoint == 0:
            # Median falls inside the very first bracket (or there are no
            # households at all) — this was `row[-6] == False` originally.
            temp_table.loc[index, 'midpoint_index'] = 0
        else:
            position = 0
            for cumulative in row.iloc[:-6]:  # the 16 cumulative bracket counts
                if midpoint >= int(cumulative):
                    temp_table.loc[index, 'midpoint_index'] = position + 1
                position += 1

    # Gather the interpolation ingredients for each tract.
    for index, row in temp_table.iterrows():
        mi = int(row['midpoint_index'])
        temp_table.loc[index, 'midpoint_index_value'] = \
            data_table.loc[index, data_table.columns[mi]]
        temp_table.loc[index, 'midpoint_index_lower'] = info.loc[mi, 'lower']
        temp_table.loc[index, 'midpoint_index_range'] = info.loc[mi, 'range']
        # Cumulative count of all brackets below the median bracket.
        # (When mi == 0 this reads iloc[-1], the still-zero helper column —
        # quirk preserved from the original; it yields 0 numerically.)
        temp_table.loc[index, 'midpoint_index_minus_one_cumulative_sum'] = \
            row.iloc[mi - 1]

    # The bracket count is our denominator, so it must never be zero.
    for index, row in temp_table.iterrows():
        if row['midpoint_index_value'] == 0:
            temp_table.at[index, 'midpoint_index_value'] = 1

    # Median = W3 + X3 * ((V3 - Y3) / Z3):
    # lower bound + width * (midpoint - cumulative-below) / households-in-bracket
    def finalCalc(x):
        return (x['midpoint_index_lower']
                + x['midpoint_index_range']
                * ((x['midpoint'] - x['midpoint_index_minus_one_cumulative_sum'])
                   / x['midpoint_index_value']))

    temp_table['final'] = temp_table.apply(finalCalc, axis=1)

    keep = extra + ['tract']  # fresh list: never mutate the caller's argument
    print('INCLUDING COLUMN(s):' + str(keep))
    temp_table[keep] = df[keep]

    # Step 4 of the original (special Baltimore City row fetched from the
    # Census API) was commented out upstream and is left out here.
    return temp_table


#@title Run This Cell: Create trav45 / trav44 / affordr / affordm
# These four indicators share one shape:
#   100 * (sum of numerator columns) / (denominator column),
# with zero-denominator tracts dropped — so the common machinery lives in
# _pct_indicator and each public function just names its ACS columns.
def _pct_indicator(df, numerator_cols, denominator_col, extra_cols):
    """Build a (sum(numerators) / denominator) * 100 indicator table.

    Carries the raw input columns, 'tract', and any *extra_cols* into the
    result, drops rows whose denominator is 0, and stores the percentage
    in 'final'.
    """
    fi = pd.DataFrame()
    for col in numerator_cols + [denominator_col, 'tract'] + list(extra_cols):
        fi = addKey(df, fi, col)

    numerators = pd.DataFrame()
    for col in numerator_cols:
        numerators = addKey(df, numerators, col)
    denominators = addKey(df, pd.DataFrame(), denominator_col)

    fi['numerator'] = numerators.sum(axis=1)
    fi['denominator'] = denominators.sum(axis=1)
    # Drop tracts with a zero denominator (the SQL used nullif(...,0)).
    fi = fi[fi['denominator'] != 0].copy()
    fi['final'] = (fi['numerator'] / fi['denominator']) * 100
    return fi


#File: trav45.py — ACS Table B08303 - TRAVEL TIME TO WORK
#(Universe: Workers 16 years and over who did not work at home)
def trav45(df, columnsToInclude=None):
    """Percent of employed population with travel time to work of 45
    minutes and over (B08303_011E..013E over B08303_001E)."""
    return _pct_indicator(df,
                          ['B08303_011E', 'B08303_012E', 'B08303_013E'],
                          'B08303_001E',
                          columnsToInclude or [])


#File: trav44.py — ACS Table B08303 - TRAVEL TIME TO WORK
#(Universe: Workers 16 years and over who did not work at home)
def trav44(df, columnsToInclude=None):
    """Percent of employed population with travel time to work of 30-44
    minutes (B08303_008E..010E over B08303_001E)."""
    return _pct_indicator(df,
                          ['B08303_008E', 'B08303_009E', 'B08303_010E'],
                          'B08303_001E',
                          columnsToInclude or [])


#File: affordr.py — ACS Table B25070 - GROSS RENT AS A PERCENTAGE OF
#HOUSEHOLD INCOME.  Universe: Renter-occupied housing units.
def affordr(df, columnsToInclude):
    """Affordability index - rent: percent of renter households in the
    B25070_007E..010E brackets over B25070_001E.  (Per the ACS table
    layout these are the 30%-and-over-of-income brackets — confirm.)"""
    return _pct_indicator(df,
                          ['B25070_007E', 'B25070_008E',
                           'B25070_009E', 'B25070_010E'],
                          'B25070_001E',
                          columnsToInclude)


#File: affordm.py — ACS Table B25091 - MORTGAGE STATUS BY SELECTED MONTHLY
#OWNER COSTS AS A PERCENTAGE OF HOUSEHOLD INCOME.
#Universe: Owner-occupied housing units.
def affordm(df, columnsToInclude):
    """Affordability index - mortgage: percent of owner households in the
    B25091_008E..011E brackets over B25091_002E."""
    return _pct_indicator(df,
                          ['B25091_008E', 'B25091_009E',
                           'B25091_010E', 'B25091_011E'],
                          'B25091_002E',
                          columnsToInclude)


#@title Run This Cell: Create age5
#File: age5.py — ACS Table B01001 - SEX BY AGE.  Universe: Total population.
def age5(df, columnsToInclude):
    """Percent of the population under 5 years old
    ((male under 5 + female under 5) / total * 100)."""
    fi = pd.DataFrame()
    columns = ['B01001_027E_Total_Female_Under_5_years',
               'B01001_003E_Total_Male_Under_5_years',
               'B01001_001E_Total',
               'tract']
    columns.extend(columnsToInclude)
    for col in columns:
        fi = addKey(df, fi, col)
    fi['final'] = (df['B01001_003E_Total_Male_Under_5_years']
                   + df['B01001_027E_Total_Female_Under_5_years']
                   ) / df['B01001_001E_Total'] * 100
    return fi


#@title Run This Cell: age65
def age65(df, columnsToInclude):
    """Percent of the population 65 and over (B01001: male 020E-025E plus
    female 044E-049E, over 001E)."""
    fi = pd.DataFrame()
    keep = df.filter(regex='001E|020E|021E|022E|023E|024E|025E'
                           '|044E|045E|046E|047E|048E|049E').columns.values
    for col in numpy.append(keep, columnsToInclude):
        fi = addKey(df, fi, col)
    fi['age65'] = (df.filter(regex='020E|021E|022E|023E|024E|025E'
                                   '|044E|045E|046E|047E|048E|049E')
                   .sum(axis=1)) / df['B01001_001E_Total:'] * 100
    return fi


#@title Run This Cell: age18
#File: age18.py — ACS Table B01001 - SEX BY AGE.  Universe: Total population.
def age18(df, columnsToInclude):
    """Percent of the population aged 5-17 (B01001 male 004E-006E plus
    female 028E-030E, over 001E).

    NOTE(review): despite the name, the brackets used here cover ages 5
    through 17 only — the under-5 brackets are handled by age5().
    """
    fi = pd.DataFrame()
    keep = df.filter(regex='001E|004E|005E|006E|028E|029E|030E').columns.values
    for col in numpy.append(keep, columnsToInclude):
        fi = addKey(df, fi, col)
    fi['age18'] = (df.filter(regex='004E|005E|006E|028E|029E|030E')
                   .sum(axis=1)) / df['B01001_001E_Total:'] * 100
    return fi


#@title Run This Cell: paa
def paa(df, columnsToInclude):
    """Percent Black or African-American alone (not Hispanic or Latino):
    B03002_004E over B03002_001E."""
    fi = pd.DataFrame()
    keep = df.filter(regex='001E|004E').columns.values
    for col in numpy.append(keep, columnsToInclude):
        fi = addKey(df, fi, col)
    fi['paa'] = (df.filter(regex='004E').sum(axis=1)
                 ) / df['B03002_001E_Total:'] * 100
    return fi


#@title Run This Cell: hisp
def hisp(df, columnsToInclude):
    """Percent Hispanic or Latino: B03002_012E over B03002_001E."""
    fi = pd.DataFrame()
    keep = df.filter(regex='001E|012E').columns.values
    for col in numpy.append(keep, columnsToInclude):
        fi = addKey(df, fi, col)
    fi['hisp'] = (df.filter(regex='012E').sum(axis=1)
                  ) / df['B03002_001E_Total:'] * 100
    return fi


#@title Run This Cell: pwhite
def pwhite(df, columnsToInclude):
    """Percent White alone (not Hispanic or Latino): B03002_003E over
    B03002_001E."""
    fi = pd.DataFrame()
    keep = df.filter(regex='001E|003E').columns.values
    for col in numpy.append(keep, columnsToInclude):
        fi = addKey(df, fi, col)
    fi['pwhite'] = (df.filter(regex='003E').sum(axis=1)
                    ) / df['B03002_001E_Total:'] * 100
    return fi


#@title Run This Cell: hh25inc
def hh25inc(df, columnsToInclude):
    """Percent of households earning under $25,000 (B19001_002E..005E over
    B19001_001E).

    Side effect: strips '$' from df's column names IN PLACE — '$' is a
    regex metacharacter and would break getColName's pattern matching.
    """
    # regex=True keeps the original (pre-pandas-2.0 default) behavior of
    # treating "[$]" as a character class rather than a literal substring.
    df.columns = df.columns.str.replace(r"[$]", "", regex=True)
    fi = pd.DataFrame()
    keep = df.filter(regex='002E|003E|004E|005E|001E').columns.values
    for col in numpy.append(keep, columnsToInclude):
        fi = addKey(df, fi, col)
    fi['hh25inc'] = (df.filter(regex='002E|003E|004E|005E').sum(axis=1)
                     ) / df['B19001_001E_Total:'] * 100
    return fi


#@title Run This Cell: novhcl
def novhcl(df, columnsToInclude):
    """Percent of households with no vehicle available: B08201_002E over
    B08201_001E."""
    fi = pd.DataFrame()
    # NOTE(review): filter regex kept from the original; '003E|004E|005E'
    # appear copied over from hh25inc and should not match the B08201 pull
    # used here — confirm against the caller's column set.
    keep = df.filter(regex='002E|003E|004E|005E|001E').columns.values
    for col in numpy.append(keep, columnsToInclude):
        fi = addKey(df, fi, col)
    fi['novhcl'] = (df.filter(regex='002E').sum(axis=1)
                    ) / df['B08201_001E_Total:'] * 100
    return fi


#@title Run This Cell: nohhint
def nohhint(df, columnsToInclude):
    """Percent of households with no internet access: B28011_008E over
    B28011_001E."""
    fi = pd.DataFrame()
    keep = df.filter(regex='008E|001E').columns.values
    for col in numpy.append(keep, columnsToInclude):
        fi = addKey(df, fi, col)
    fi['nohhint'] = (df.filter(regex='008E').sum(axis=1)
                     ) / df['B28011_001E_Total:'] * 100
    return fi

# Now that our calculations have been created, let's: