#default_exp rbintel

This Colab notebook, and more, can be found at https://github.com/BNIA/vitalsigns.

What's Inside:

Indicators Used

  • ✅ 30 - dom - (RBIntel) Median Number of Days on the Market
  • ✅ 38 - cashsa - (RBIntel) Percentage of residential sales for cash
  • ✅ 39 - reosa - (RBIntel) Percentage of residential sales in foreclosure (REO)

Datasets Used

  • ✔️ housing.rbintelregion_201X (30-dom → DaysonMark (formerly DaysOnMark), 38-cashsa → BuyerFinan (formerly NewTrust1L), 39-reosa → Foreclosur)
year = '18'

SETUP Environment:

Import Modules

!pip install -U -q PyDrive
!pip install geopy
!pip install geopandas
!pip install geoplot
!pip install dataplay
!pip install matplotlib
!pip install psycopg2-binary
!apt-get install build-dep python-psycopg2
!apt-get install libpq-dev
!apt-get install libspatialindex-dev
!pip install rtree
!pip install dexplot

from dataplay.geoms import workWithGeometryData

%%capture
# These imports will handle everything
import os
import sys
import csv
import psycopg2
import pyproj
from pyproj import Proj, transform  # conda install -c conda-forge proj4
from shapely.geometry import Point
from shapely import wkb
from shapely.wkt import loads
# https://pypi.org/project/geopy/
from geopy.geocoders import Nominatim
# In case file is KML, enable support
import fiona
fiona.drvsupport.supported_drivers['kml'] = 'rw'
fiona.drvsupport.supported_drivers['KML'] = 'rw'

from IPython.display import clear_output
clear_output(wait=True)

import ipywidgets as widgets
from ipywidgets import interact, interact_manual

#export
import numpy as np
import pandas as pd
# import geopandas
import geopandas as gpd
# from geopandas import GeoDataFrame

Configure Environment

# This will just beautify the output
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# pd.set_option('display.expand_frame_repr', False)
# pd.set_option('display.precision', 2)
# pd.reset_option('max_colwidth')
pd.set_option('max_colwidth', 20)
# pd.reset_option('max_colwidth')

Prep Datasets

TPOP CSA and Baltimore

Get CSAs

csa = "https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Tpop/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson"
csa = gpd.read_file(csa)
csa.head(1)

Get Baltimore City

url2 = "https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Tpop/FeatureServer/1/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson"
csa2 = gpd.read_file(url2)
csa2['CSA2010'] = csa2['City_1']
csa2['OBJECTID'] = 56
csa2 = csa2.drop(columns=['City_1'])
csa2.head()

Append Baltimore City (Bcity) to the CSA dataframe. We put it at the bottom of the df because, when performing the ponp (point-in-polygon) labeling, only the last matching polygon's CSA label is returned.

# DataFrame.append was removed in pandas 2.x; pd.concat is the equivalent.
# Baltimore City (csa2) goes last so it sits at the bottom of the dataframe.
csa = pd.concat([csa, csa2], ignore_index=True)
# csa = csa.append(csa2).reset_index(drop=True)
csa.head(3)
csa.tail(3)
csa.head()
csa.drop(columns=['Shape__Area', 'Shape__Length', 'OBJECTID']).to_file("BCity_and_CSA.geojson", driver='GeoJSON')
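To make the ordering note above concrete, here is a minimal sketch of the "last matching polygon's label wins" behavior. It is not the dataplay workWithGeometryData implementation; the polygons and labels are made up for illustration, and it only demonstrates that, when the last match wins, row order determines which label a point receives.

from shapely.geometry import Point, box

# Hypothetical polygons: a CSA square that sits inside a larger Baltimore City square.
demo_polys = [
    {'CSA2010': 'Some CSA',       'geometry': box(0, 0, 1, 1)},
    {'CSA2010': 'Baltimore City', 'geometry': box(-1, -1, 2, 2)},  # city row intentionally last
]
pt = Point(0.5, 0.5)  # falls inside both polygons

# If only the last matching polygon's label is kept, row order decides the result.
label = None
for poly in demo_polys:
    if poly['geometry'].contains(pt):
        label = poly['CSA2010']   # later matches overwrite earlier ones
print(label)  # -> 'Baltimore City' with this ordering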

Labeled Points - Analysis.

original = gpd.read_file("RBIntel_20"+year+"_BaltRegion_newfields_CSA_City.shp")
original.rename(columns={'CSA': 'CSA2010', 'BaltCity': 'InBaltimore'}, inplace=True)

# Keep only records that carry a CSA label and/or a Baltimore City label.
# (.copy() avoids SettingWithCopyWarning when the 'count' column is added below.)
df = original[ original['CSA2010'].notnull() | original['InBaltimore'].notnull() ].copy()

print('Total # Rows: ', original.shape[0])  # rows, columns
print('# Before | After')
print('# Where BCity.isnull/notnull: ', original.InBaltimore.isnull().sum(), '|', original.InBaltimore.notnull().sum())
print('# Where CSA2010.isnull/notnull: ', original.CSA2010.isnull().sum(), '|', original.CSA2010.notnull().sum())
print('# Where CSA and/or Baltimore lbl Exists: ', df.shape[0])

df.describe().to_csv('18_UnfilteredOnForeDescriptions.csv')
df[ df['Foreclosur'].str.contains('.Y.|.Y|Y.|Y', regex=True, na=False) ].describe().to_csv('18_filteredOnForeDescriptions.csv')

df['count'] = 1
df.groupby('CSA2010').sum(numeric_only=True)['count'].to_csv('18_UnfilteredOnFore.csv')
df[ df['Foreclosur'].str.contains('.Y.|.Y|Y.|Y', regex=True, na=False) ].groupby('CSA2010').sum(numeric_only=True)['count'].to_csv('18_FilteredOnFore.csv')

# from VitalSigns.utils import *
# df = check_labels(original)

df.CSA2010 = df.CSA2010.fillna('Baltimore City')
df.CSA2010.unique()

You need to run the indicator functions at least once before running this. Also, it just doesn't work at the moment.

VS Indicator Functions

Preview

original.shape[0]
print(' ')
df.shape[0]
print(' ')
df[ df['DaysonMark'] > 0 ].shape[0]
print(' ')
df[ df['Foreclosur'].str.contains('.Y.|.Y|Y.|Y', regex=True, na=False) ].shape[0]

NewTrust1L is dead. Long live BuyerFinan.

df['Foreclosur'].value_counts()
df['BuyerFinan'].value_counts()
df[ df['DaysonMark'] > 0 ].shape[0]
df[ df['DaysonMark'] > 0 ][['CSA2010','DaysonMark']].groupby('CSA2010').agg(['median', 'count']).head(4)
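If an older extract still carries the cash-sale flag under NewTrust1L rather than BuyerFinan, a small defensive helper like the sketch below can normalize the column name before the preview and indicator cells run. This is not part of the original workflow; ensure_buyer_finan is a hypothetical helper name.

def ensure_buyer_finan(frame):
    """Return a copy with a BuyerFinan column, falling back to NewTrust1L if needed."""
    frame = frame.copy()
    if 'BuyerFinan' not in frame.columns and 'NewTrust1L' in frame.columns:
        frame['BuyerFinan'] = frame['NewTrust1L']
    return frame

df = ensure_buyer_finan(df)
df['BuyerFinan'].value_counts(dropna=False)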

DOM 30

# https://services1.arcgis.com/mVFRs7NF4iFitgbY/arcgis/rest/services/dom/FeatureServer/layers
# https://bniajfi.org/indicators/Housing%20And%20Community%20Development/dom
lbl = 'Median Number of Days on the Market'
TopicArea = 'Housing And Community Development'
YearsAvailable = '2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020'
long_Description = """
The median number of days that homes listed for sale sit on the public market in a given area.
This time period runs from the date a home is listed for sale until the day the contract of sale is signed.
Private (non-listed) home sale transactions are not included in this indicator.
The median days on market is used as opposed to the average so that both extremely high and extremely low
days on the market do not distort the length of time for which homes are listed on the market.
"""
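As a quick illustration of the point in the description above (the numbers are made up): a single stale listing pulls the mean far above the typical experience, while the median stays put.

# Tiny illustration of why the median is preferred over the mean here:
# one stale listing (400 days) drags the mean far above the typical experience.
sample_dom = pd.Series([12, 15, 20, 25, 400])
print('mean:  ', sample_dom.mean())    # 94.4 days
print('median:', sample_dom.median())  # 20.0 days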

We used to calculate our own DOM field, but RBIntel now includes it in the extract (a hedged sketch of the old calculation follows below).
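For reference only, here is roughly what that old calculation could look like if the extract did not ship a DaysonMark field. 'ListDate' is a hypothetical column name used purely for illustration; only 'SoldDate' appears in the data referenced elsewhere in this notebook.

# Sketch only: recompute days on market from listing/sale dates.
tmp = df.copy()
tmp['SoldDate'] = pd.to_datetime(tmp['SoldDate'], errors='coerce')
tmp['ListDate'] = pd.to_datetime(tmp['ListDate'], errors='coerce')   # hypothetical column
tmp['dom_calc'] = (tmp['SoldDate'] - tmp['ListDate']).dt.days
# Median (not mean) per CSA, so a few extreme listings do not skew the indicator.
tmp.groupby('CSA2010')['dom_calc'].median()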

#export
def vsDom(df, yr):
    id = '30'
    shortname = 'dom'
    fincol = id+'-'+shortname+year
    # Create the Numerator and Denominator
    numer = df[['DaysonMark','CSA2010']].copy()
    # Filter Em
    numer = numer[ numer['DaysonMark'] > 0 ]
    print( numer.shape[0] )
    # Get Bcity Val
    bCityVal = numer.median(numeric_only=True)['DaysonMark']
    # Group by CSA; use .median to calculate DOM.
    numer = numer.groupby('CSA2010').median(numeric_only=True)
    # Make sure ALL CSAs and Baltimore City are included and sorted.
    numer = csa.merge( numer, left_on='CSA2010', right_on='CSA2010', how='outer' )
    numer.drop( columns=['geometry', 'Shape__Length', 'Shape__Area', 'OBJECTID', 'tpop10'], inplace=True )
    # Bcity is the median of all the records and not the median of the community medians.
    # Incorrect Bcity median IFF Groupby keeps a 'False' row (index 56)
    numer.at[55, 'DaysonMark'] = bCityVal
    # Perform the calculation
    numer[fincol] = numer['DaysonMark']
    # Pull the published values for a prior year and compute change / percent change.
    compareYears = gpd.read_file("https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/"+shortname.capitalize()+"/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson")
    goback = 2 if year == '19' else 3 if year == '20' else 1
    prevYear = shortname + str( int(year) - goback )
    if prevYear in compareYears.columns:
        numer = numer.merge( compareYears[['CSA2010', prevYear]], left_on='CSA2010', right_on='CSA2010', how='outer' )
        numer['change'] = numer[id+'-'+shortname+year] - numer[prevYear]
        numer['percentChange'] = numer['change'] / numer[prevYear] * 100
        numer['change'] = numer['change'].apply(lambda x: "{:.2f}".format(x))
    print( 'Records Matching Query: ', numer.size / len(numer.columns) )
    return numer

t = vsDom(df, year)
t.head(2)
t.tail(2)
t.to_csv('30_dom_'+year+'.csv', index=False)

CASHSA 38

NewTrust1L is dead. Long live BuyerFinan.

# https://services1.arcgis.com/mVFRs7NF4iFitgbY/arcgis/rest/services/cashsa/FeatureServer/layers
# https://bniajfi.org/indicators/Housing%20And%20Community%20Development/cashsa
lbl = 'Percentage of Residential Sales for Cash'
TopicArea = 'Housing And Community Development'
YearsAvailable = '2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018'
long_Description = """
The percent of homes and condominiums sold for cash out of all residential properties sold in a calendar year.
These types of sales tend to signify investor-based purchases, as homes purchased for cash either become
rental properties or are later sold again in an effort to generate a profit.
"""
original_sql = """
with numerator AS (
  select (sum( case when newtrust1l = $$Cash$$ then 1 else 0 end)::numeric) as result, csa
  from vital_signs.match_csas_and_bc_by_geom('housing.rbintelregion_2017', 'gid', 'the_geom') a
  left join housing.rbintelregion_2017 b on a.gid = b.gid
  group by csa
), denominator AS (
  select (sum( case when csa_present then 1 else NULL end)::numeric ) as result, csa
  from vital_signs.match_csas_and_bc_by_geom('housing.rbintelregion_2017', 'gid', 'the_geom') a
  left join housing.rbintelregion_2017 b on a.gid = b.gid
  group by csa, the_pop
), tbl AS (
  select denominator.csa, (numerator.result / denominator.result)*(100::numeric) as result
  from numerator left join denominator on numerator.csa = denominator.csa
)
select * from tbl where 1 = 1 ORDER BY csa ASC;
"""

#export
def cashsa(df, yr):
    id = '38'
    shortname = 'cashsa'
    fincol = id+'-'+shortname+year
    # Create the Numerator and Denominator
    numer = df[['BuyerFinan','CSA2010']].copy()
    numer['count'] = 1
    denom = numer.copy()
    # Filter Em
    numer = numer[ numer['BuyerFinan'].str.contains('.Cash.|.Cash|Cash.|Cash', regex=True, na=False) ]
    print("LENGTH AFTER FILTER: ", len(numer))
    # Get Bcity Val
    bCityVal = numer.sum(numeric_only=True)['count']
    bCityValDenom = denom.sum(numeric_only=True)['count']
    # Group by CSA
    numer = numer.groupby('CSA2010').sum(numeric_only=True)
    denom = denom.groupby('CSA2010').sum(numeric_only=True)
    # Make sure ALL CSAs and Baltimore City are included and sorted.
    numer = csa.merge( numer, left_on='CSA2010', right_on='CSA2010', how='outer' )
    numer.drop( columns=['geometry', 'Shape__Length', 'Shape__Area', 'OBJECTID', 'tpop10'], inplace=True )
    denom = csa.merge( denom, left_on='CSA2010', right_on='CSA2010', how='outer' )
    denom.drop( columns=['geometry', 'Shape__Length', 'Shape__Area', 'OBJECTID', 'tpop10'], inplace=True )
    # Bcity is the sum of the community sums.
    # Incorrect Bcity Sum IFF Groupby keeps a 'False' row (index 56)
    numer.at[55, 'count'] = bCityVal
    denom.at[55, 'count'] = bCityValDenom
    # Perform the calculation
    numer[fincol] = numer['count'] / denom['count'] * 100
    # Pull the published values for a prior year and compute change / percent change.
    compareYears = gpd.read_file("https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/"+shortname.capitalize()+"/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson")
    goback = 2 if year == '19' else 3 if year == '20' else 1
    prevYear = shortname + str( int(year) - goback )
    if prevYear in compareYears.columns:
        numer = numer.merge( compareYears[['CSA2010', prevYear]], left_on='CSA2010', right_on='CSA2010', how='outer' )
        numer['change'] = numer[id+'-'+shortname+year] - numer[prevYear]
        numer['percentChange'] = numer['change'] / numer[prevYear] * 100
        numer['change'] = numer['change'].apply(lambda x: "{:.2f}".format(x))
    print( 'Records Matching Query: ', numer.size / len(numer.columns) )
    return numer

resp = cashsa(df, year)
resp.head(2)
resp.tail(2)
cashsa(df, year).to_csv('38_cashsa_'+year+'.csv', index=False)

REOSA 39

# https://services1.arcgis.com/mVFRs7NF4iFitgbY/arcgis/rest/services/reosa/FeatureServer/layers
# https://bniajfi.org/indicators/Housing%20And%20Community%20Development/reosa/2017
lbl = 'Percentage of Residential Sales in Foreclosure (REO)'
TopicArea = 'Housing And Community Development'
YearsAvailable = '2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020'
long_Description = """
The portion of the homes and condominiums sold that were identified as being owned by the bank (REO)
out of all residential properties sold in a calendar year.
"""
original_sql = """
with numerator AS (
  select (sum( case when foreclosur = $$Y$$ then 1 else 0 end)::numeric) as result, csa
  from vital_signs.match_csas_and_bc_by_geom('housing.rbintelregion_2017', 'gid', 'the_geom') a
  left join housing.rbintelregion_2017 b on a.gid = b.gid
  group by csa
), denominator AS (
  select (sum( case when csa_present then 1 else NULL end)::numeric ) as result, csa
  from vital_signs.match_csas_and_bc_by_geom('housing.rbintelregion_2017', 'gid', 'the_geom') a
  left join housing.rbintelregion_2016 b on a.gid = b.gid
  group by csa, the_pop
), tbl AS (
  select denominator.csa, (numerator.result / denominator.result)*(100::numeric) as result
  from numerator left join denominator on numerator.csa = denominator.csa
)
select * from tbl where 1 = 1 ORDER BY csa ASC;
"""

#export
def reosa(df, yr):
    id = '39'
    shortname = 'reosa'
    fincol = id+'-'+shortname+year
    # Create the Numerator and Denominator
    numer = df[['Foreclosur','CSA2010']].copy()
    numer['count'] = 1
    denom = numer.copy()
    denom['count'] = 1
    # Filter Em
    numer = numer[ numer['Foreclosur'].str.contains('.Y.|.Y|Y.|Y', regex=True, na=False) ]
    print( numer['Foreclosur'].value_counts() )
    # Get Bcity Val
    bCityVal = numer.sum(numeric_only=True)['count']
    bCityValDenom = denom.sum(numeric_only=True)['count']
    # Group by CSA
    numer = numer.groupby('CSA2010').sum(numeric_only=True)
    denom = denom.groupby('CSA2010').sum(numeric_only=True)
    # Make sure ALL CSAs and Baltimore City are included and sorted.
    numer = csa.merge( numer, left_on='CSA2010', right_on='CSA2010', how='outer' )
    numer.drop( columns=['geometry', 'Shape__Length', 'Shape__Area', 'OBJECTID', 'tpop10'], inplace=True )
    denom = csa.merge( denom, left_on='CSA2010', right_on='CSA2010', how='outer' )
    denom.drop( columns=['geometry', 'Shape__Length', 'Shape__Area', 'OBJECTID', 'tpop10'], inplace=True )
    # Bcity is the sum of the community sums.
    # Incorrect Bcity Sum IFF Groupby keeps a 'False' row (index 56)
    numer.at[55, 'count'] = bCityVal
    denom.at[55, 'count'] = bCityValDenom
    # Perform the calculation
    numer['denomCount'] = denom['count']
    numer[fincol] = numer['count'] / numer['denomCount'] * 100
    # Pull the published values for a prior year and compute change / percent change.
    compareYears = gpd.read_file("https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/"+shortname.capitalize()+"/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson")
    goback = 2 if year == '19' else 3 if year == '20' else 1
    prevYear = shortname + str( int(year) - goback )
    if prevYear in compareYears.columns:
        numer = numer.merge( compareYears[['CSA2010', prevYear]], left_on='CSA2010', right_on='CSA2010', how='outer' )
        numer['change'] = numer[id+'-'+shortname+year] - numer[prevYear]
        numer['percentChange'] = numer['change'] / numer[prevYear] * 100
        numer['change'] = numer['change'].apply(lambda x: "{:.2f}".format(x))
    print( 'Records Matching Query: ', numer.size / len(numer.columns) )
    return numer

resp = reosa(df, year)
resp.head()
resp.tail()
resp.to_csv('39_reosa_'+year+'.csv', index=False)

OLD_FNS

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.duplicated.html
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html
# https://stackoverflow.com/questions/41308763/python-pandas-df-duplicated-and-df-drop-duplicated-not-finding-all-duplicates
# NOTE: These older helpers reference previous function/column names
# (vsCashsa, vsReosa, DaysOnMark, NewTrust1L) and are kept for reference only.
import matplotlib.pyplot as plt  # needed for the plotting helpers below

def exploreDs(df, yr):

    def createIndicatorAndPlotChoropleth(ddf, txt1):
        fig, ax = plt.subplots(1, 1)
        csa.merge( vsDom(df, 'DOM_'+txt1+yr), left_on='CSA2010', right_on='CSA2010' ).plot(column='dom', ax=ax, legend=True)
        plt.savefig('./output/img/DOM_Map_Of_the_'+txt1+yr+'.jpg')
        csa.merge( vsCashsa(df, 'Cashsa_'+txt1+yr), left_on='CSA2010', right_on='CSA2010' ).plot(column='cashsa', ax=ax, legend=True)
        plt.savefig('./output/img/Cashsa_Map_Of_the_'+txt1+yr+'.jpg')
        csa.merge( vsReosa(df, 'Reosa_'+txt1+yr), left_on='CSA2010', right_on='CSA2010' ).plot(column='reosa', ax=ax, legend=True)
        plt.savefig('./output/img/Reosa_Map_Of_the_'+txt1+yr+'.jpg')

    def plotAndSave(ddf, txt):
        fig, ax = plt.subplots(1, 1)
        base = csa.plot(color='white', edgecolor='black')
        ddf.plot(ax=base, marker='o', color='green', markersize=5)
        plt.savefig('./output/'+txt)

    print('!~~~~~~~~~~~~~~~~~~~~~!STARTING!!!!!! ', yr, ' !~~~~~~~~~~~~~~~~~~~~~!')

    # Drop all un-needed columns
    df = df[['CSA2010', 'AddressLin', 'geometry', 'DaysOnMark', 'NewTrust1L', 'Foreclosur', 'SoldDate']]
    # Sort the dataset by Address
    # df = df.sort_values(by=['AddressLin']).reset_index()
    print('Given: ', len(df), ' Records')
    # Run the indicators
    createIndicatorAndPlotChoropleth(df, 'Untouched_Records')
    # Plot it on a CSA Map
    plotAndSave(df, 'Dot_Map_Of_the_Untouched_Records_'+yr+'.jpg')

    # Drop the NON CSA Records.
    # Save a copy of the dropped records?
    # - Nah. They won't affect our calculations and removing them adds clarity.
    df.drop(df[df['CSA2010'] == 'false'].index, inplace=True)
    print('There are ', len(df), ' Records Remaining after Dropping Non-CSA Records')
    # Run the indicators
    createIndicatorAndPlotChoropleth(df, 'Dropped_Non_CSA_Records')
    # Plot it on a CSA Map
    plotAndSave(df, 'Dot_Map_Of_the_Dropped_Non_CSA_Records_'+yr+'.jpg')

    # Determines which duplicates (if any) to keep.
    # - first : Drop duplicates except for the first occurrence.
    # - last  : Drop duplicates except for the last occurrence.
    # - False : Drop all duplicates.
    # Filter the dataset for duplicates in the AddressLin.
    val1 = df.drop_duplicates(subset=['SoldDate', 'AddressLin'], keep='last').reset_index()
    print('There are', len(val1), ' Records Remaining after Dropping all but the last Duplicate on SoldDate & AddressLin')
    # Run the indicators
    createIndicatorAndPlotChoropleth(val1, 'Dropped_Non_CSA_Records_and_Deduped')
    # Plot it on a CSA Map
    plotAndSave(val1, 'Dot_Map_Of_the_Dropped_Non_CSA_Records_and_Deduped_'+yr+'.jpg')

    # Save a copy of the data that was filtered out in a new dataset
    val2 = df[df.duplicated(subset=['SoldDate', 'AddressLin'], keep=False)].reset_index()
    print('Having Removed This Many: ', len(val2))
    # Run the indicators
    createIndicatorAndPlotChoropleth(val2, 'Dropped_Non_CSA_Records_and_Kept_Only_Duplicates')
    # Plot it on a CSA Map
    plotAndSave(val2, 'Dot_Map_Of_the_Dropped_Non_CSA_Records_and_Kept_Only_Duplicates_'+yr+'.jpg')

    return (val1, val2, df)

# r177, val217, val317 = exploreDs(r17, '17')
# r188, val218, val318 = exploreDs(r18, '18')
# r189, val219, val319 = exploreDs(df, year)

def inspectReosaDenominator(df, yr):
    print('Unique Foreclosure Values', df.Foreclosur.unique())
    print('Unique NewTrust1L Values', df.NewTrust1L.unique())
    # Dedupe on 'AddressLin', 'SoldDate'
    print("Original Dataset's Length: ", len(df))
    temp = df.drop_duplicates(subset=['AddressLin', 'SoldDate'], keep='last')
    print('Deduped Length: ', len(temp))
    print('Number of Records Removed: ', len(df) - len(temp))
    # Drop any NA AddressLin
    temp = temp.dropna(subset=['AddressLin'])
    print('Num Removed With No NA Addresses: ', len(df) - len(temp))
    temp.head(1)
    # CSA2010 AddressLin SoldDate
    temp['count'] = 1
    v1 = temp.groupby(by=["CSA2010", "Foreclosur"]).sum()
    v2 = temp.groupby(by=["CSA2010", "DaysOnMark"]).median()
    v3 = temp.groupby(by=["CSA2010", "NewTrust1L"]).sum()
    # .sort_values(by=['col1', 'col2'])
    v1.to_csv('reosa_Deduped'+yr+'_CSAs_Unique_Foreclosure_Counts.csv', index=False)
    v2.to_csv('reosa_Deduped'+yr+'_CSAs_Unique_DOM_Counts.csv', index=False)
    v3.to_csv('reosa_Deduped'+yr+'_CSAs_Unique_CASHSA_Counts.csv', index=False)
    return temp

# inspectReosaDenominator(df, year)
# Compare DS's for each CSA where Points Exist but a Foreclosure Value does not.

def retrieveAndcleanRbIntel(filename, year):
    rbintel = gpd.read_file(filename)
    print(len(rbintel))
    # Convert to EPSG:4326
    rbintel = rbintel.to_crs(epsg=4326)
    rbintel.crs
    rbintel['x'] = rbintel.geometry.x
    rbintel['y'] = rbintel.geometry.y
    # Reference: All Points
    base = csa.plot(color='white', edgecolor='black')
    rbintel.plot(ax=base, marker='o', color='green', markersize=5)
    # Get CSA Labels for all Points.
    rbintelCSA = workWithGeometryData( method='ponp', df=rbintel, polys=csa, ptsCoordCol='geometry',
                                       polygonsCoordCol='geometry', polyColorCol=False, polygonsLabel='CSA2010' )
    rbintelCSA = rbintelCSA.drop('geometry', axis=1)
    rbintelCSA.to_csv('ponp_rbintel_'+year+'.csv', index=False)
    return rbintelCSA

Region 2017 and Region 2018 should contain a similar number of records (see the sanity-check sketch below).
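A minimal sanity check along those lines might look like this; the 2017 filename is assumed to mirror the 2018 naming pattern used earlier and may need adjusting to match the actual extract.

# Sketch: compare record counts between the two regional extracts (filenames assumed).
r17 = gpd.read_file("RBIntel_2017_BaltRegion_newfields_CSA_City.shp")
r18 = gpd.read_file("RBIntel_2018_BaltRegion_newfields_CSA_City.shp")
print('2017 records:', len(r17))
print('2018 records:', len(r18))
print('Percent difference: {:.1f}%'.format(abs(len(r18) - len(r17)) / len(r17) * 100))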
