#default_exp rbintel
This colab and more can be found at https://github.com/BNIA/vitalsigns.
What's Inside:
Indicators Used
- ✅ 30 - dom - (RBIntel) Median Number of Days on the Market
- ✅ 38 - cashsa - (RBIntel) Percentage of residential sales for cash
- ✅ 39 - reosa - (RBIntel) Percentage of residential sales in foreclosure (REO)
Datasets Used
- ✔️ housing.rbintelregion_201X (30-dom, 38-cashsa, 39-reosa -> DaysonMark, BuyerFinan (formerly newtrust1l), foreclosur)
year = '18'
SETUP Environment:
Import Modules
! pip install -U -q PyDrive
! pip install geopy
! pip install geopandas
! pip install geoplot
! pip install dataplay
! pip install matplotlib
! pip install psycopg2-binary
! apt-get build-dep python-psycopg2
! apt-get install libpq-dev
! apt-get install libspatialindex-dev
! pip install rtree
! pip install dexplot
from dataplay.geoms import workWithGeometryData
%%capture
# These imports will handle everything
import os
import sys
import csv
import psycopg2
import pyproj
from pyproj import Proj, transform
# conda install -c conda-forge proj4
from shapely.geometry import Point
from shapely import wkb
from shapely.wkt import loads
# https://pypi.org/project/geopy/
from geopy.geocoders import Nominatim
# In case file is KML, enable support
import fiona
fiona.drvsupport.supported_drivers['kml'] = 'rw'
fiona.drvsupport.supported_drivers['KML'] = 'rw'
from IPython.display import clear_output
clear_output(wait=True)
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
#export
import numpy as np
import pandas as pd
# import geopandas
import geopandas as gpd
# from geopandas import GeoDataFrame
Configure Environment
# This will just beautify the output
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# pd.set_option('display.expand_frame_repr', False)
# pd.set_option('display.precision', 2)
# pd.reset_option('max_colwidth')
pd.set_option('max_colwidth', 20)
# pd.reset_option('max_colwidth')
Prep Datasets
TPOP CSA and Baltimore
Get Baltimore
csa = "https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Tpop/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson"
csa = gpd.read_file(csa);
csa.head(1)
Get CSA
url2 = "https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Tpop/FeatureServer/1/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson"
csa2 = gpd.read_file(url2);
csa2['CSA2010'] = csa2['City_1']
csa2['OBJECTID'] = 56
csa2 = csa2.drop(columns=['City_1'])
csa2.head()
Append (do not prepend) Bcity. We put it at the bottom of the df because when performing the ponp it returns only the last matching polygon's CSA label.
# csa = pd.concat([csa2, csa], ignore_index=True)
csa = csa.append(csa2).reset_index(drop=True)
csa.head(3)
csa.tail(3)
csa.head()
csa.drop(columns=['Shape__Area', 'Shape__Length', 'OBJECTID'], axis=1).to_file("BCity_and_CSA.geojson", driver='GeoJSON')
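Note: on pandas 2.0+ DataFrame.append no longer exists. If the append in the cell above errors, the equivalent concat (in this order, so the Bcity row stays at the bottom) is:
# Equivalent on newer pandas; order matters so the Bcity row stays last.
# csa = pd.concat([csa, csa2], ignore_index=True)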
Labeled Points - Analysis.
original = gpd.read_file("RBIntel_20"+year+"_BaltRegion_newfields_CSA_City.shp"); original.rename(columns={ 'CSA':'CSA2010', 'BaltCity':'InBaltimore'}, inplace=True)
df = original[ original['CSA2010'].notnull() | original['InBaltimore'].notnull() ].copy()
print( 'Total # Rows: ', original.shape[0] ) # rows, columns
print( '# Before | After')
print( '# Where BCity.isnull/notnull: ', original.InBaltimore.isnull().sum(), '|', original.InBaltimore.notnull().sum() );
print( '# where CSA2010.isnull/notnull: ', original.CSA2010.isnull().sum(), '|', original.CSA2010.notnull().sum() );
print( '# Where CSA and/or Baltimore lbl Exists: ', df.shape[0])
df.describe().to_csv('18_UnfilteredOnForeDescriptions.csv')
df[ df['Foreclosur'].str.contains('.Y.|.Y|Y.|Y', regex=True, na=False) ].describe().to_csv('18_filteredOnForeDescriptions.csv')
df['count'] = 1
df.groupby('CSA2010').sum(numeric_only=True)['count'].to_csv('18_UnfilteredOnFore.csv')
df[ df['Foreclosur'].str.contains('.Y.|.Y|Y.|Y', regex=True, na=False) ].groupby('CSA2010').sum(numeric_only=True)['count'].to_csv('18_FilteredOnFore.csv')
# from VitalSigns.utils import *
# df = check_labels(original)
df.CSA2010 = df.CSA2010.fillna('Baltimore City')
df.CSA2010.unique()
You need to run the indicator functions at least once before running this. Also, it just doesn't work at the moment.
VS Indicator Functions
Preview
original.shape[0]
print(' ')
df.shape[0]
print(' ')
df[ df['DaysonMark'] > 0].shape[0]
print(' ')
df[ df['Foreclosur'].str.contains('.Y.|.Y|Y.|Y', regex=True, na=False) ].shape[0]
NewTrust1L is dead. Long live BuyerFinan.
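Older RBIntel vintages only carry the NewTrust1L column. A minimal sketch (an assumption, not part of the original workflow) for aliasing the legacy column so the BuyerFinan-based code below still runs, assuming the legacy column holds the same labels (e.g. 'Cash'):
# Sketch: alias the legacy column to the new name so downstream code that
# expects 'BuyerFinan' keeps working. Assumes NewTrust1L holds the same labels.
if 'BuyerFinan' not in df.columns and 'NewTrust1L' in df.columns:
    df = df.rename(columns={'NewTrust1L': 'BuyerFinan'})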
df['Foreclosur'].value_counts()
df['BuyerFinan'].value_counts()
df[ df['DaysonMark'] > 0].shape[0]
df[ df['DaysonMark'] > 0][['CSA2010','DaysonMark']].groupby('CSA2010').agg(['median', 'count']).head(4)
DOM 30
# https://services1.arcgis.com/mVFRs7NF4iFitgbY/arcgis/rest/services/dom/FeatureServer/layers
# https://bniajfi.org/indicators/Housing%20And%20Community%20Development/dom
lbl = 'Median Number of Days on the Market'
TopicArea = 'Housing And Community Development'
YearsAvailable = '2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020'
long_Description = """
The median number of days that homes listed for sale sit on the public market in a given area.
This time period is from the date it is listed for sale till the day the contract of sale is signed.
Private (non-listed) home sale transactions are not included in this indicator.
The median days on market is used as opposed to the average so that both extremely high and extremely
low days on the market do not distort the length of time for which homes are listed on the market.
"""
We used to calculate our own DOM field, but RBIntel now provides it directly.
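For reference only, a minimal sketch of how DOM could be re-derived if a future vintage dropped the field again. 'ListDate' is a hypothetical column name (only SoldDate appears in these extracts), so substitute whatever list-date field the vintage actually provides:
# Hypothetical re-derivation of days-on-market from two date columns.
# 'ListDate' does not exist in this dataset and is only an assumption here.
tmp = df.copy()
tmp['SoldDate'] = pd.to_datetime(tmp['SoldDate'], errors='coerce')
tmp['ListDate'] = pd.to_datetime(tmp['ListDate'], errors='coerce')
tmp['DaysonMark'] = (tmp['SoldDate'] - tmp['ListDate']).dt.days
# The indicator then takes the CSA-level median (not the mean) so extreme listings don't skew it.
tmp[ tmp['DaysonMark'] > 0 ].groupby('CSA2010')['DaysonMark'].median().head()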
#export
def vsDom(df, yr):
    id = '30'
    shortname = 'dom'
    fincol = id+'-'+shortname+year
    # Create the Numerator and Denominator
    numer = df[['DaysonMark','CSA2010']].copy()
    # Filter Em
    numer = numer[ numer['DaysonMark'] > 0]
    print( numer.shape[0] )
    # Get Bcity Val
    bCityVal = numer.median(numeric_only=True)['DaysonMark']
    # Group by CSA
    numer = numer.groupby('CSA2010').median(numeric_only=True) # use .median to calculate DOM.
    # Make sure ALL csas and BaltimoreCity are included and sorted.
    numer = csa.merge( numer, left_on='CSA2010', right_on='CSA2010', how='outer' )
    numer.drop( columns=['geometry', 'Shape__Length','Shape__Area', 'OBJECTID', 'tpop10'], inplace=True)
    # Bcity is the median of all the records and not the median of the community medians.
    # Incorrect Bcity median IFF Groupby keeps a 'False' row (index 56); index 55 is the appended Baltimore City row.
    numer.at[55,'DaysonMark']= bCityVal
    # Perform the calculation
    numer[fincol] = numer['DaysonMark']
    compareYears = gpd.read_file("https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/"+shortname.capitalize()+"/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson");
    # e.g. for year '18' goback is 1, so the comparison column is dom17.
    goback = 2 if year == '19' else 3 if year == '20' else 1
    prevYear = shortname + str( int(year) - goback )
    if prevYear in compareYears.columns:
        numer = numer.merge( compareYears[['CSA2010', prevYear]], left_on='CSA2010', right_on='CSA2010', how='outer' )
        numer['change'] = numer[id+'-'+shortname+year] - numer[ prevYear ]
        numer['percentChange'] = numer['change' ] / numer[ prevYear ] * 100
        numer['change'] = numer['change'].apply(lambda x: "{:.2f}".format(x) )
    print( 'Records Matching Query: ', numer.size / len(numer.columns) )
    return numer
t = vsDom(df,year)
t.head(2)
t.tail(2)
t.to_csv('30_dom_'+year+'.csv', index=False)
CASHSA 38
NewTrust1L is dead. Long live BuyerFinan.
# https://services1.arcgis.com/mVFRs7NF4iFitgbY/arcgis/rest/services/cashsa/FeatureServer/layers
# https://bniajfi.org/indicators/Housing%20And%20Community%20Development/cashsa
lbl = 'Percentage of Residential Sales for Cash'
TopicArea = 'Housing And Community Development'
YearsAvailable = '2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018'
long_Description = """
The percent of homes and condominiums sold for cash out of all residential properties sold in a calendar year.
These types of sales tend to signify investor-based purchases as homes purchased for cash either
become rental properties or later sold again in an effort to generate a profit.
"""
original_sql = """
with numerator AS (
select (sum(
case
when newtrust1l = $$Cash$$
then 1
else 0
end)::numeric) as result, csa
from vital_signs.match_csas_and_bc_by_geom('housing.rbintelregion_2017', 'gid', 'the_geom') a
left join housing.rbintelregion_2017 b on a.gid = b.gid
group by csa
),
denominator AS (
select (sum(
case
when csa_present
then 1
else NULL
end)::numeric
) as result, csa
from vital_signs.match_csas_and_bc_by_geom('housing.rbintelregion_2017', 'gid', 'the_geom') a
left join housing.rbintelregion_2017 b on a.gid = b.gid
group by csa, the_pop
),
tbl AS (
select denominator.csa,(numerator.result / denominator.result)*(100::numeric) as result
from numerator left join denominator on numerator.csa = denominator.csa
)
select * from tbl where 1 = 1 ORDER BY csa ASC;
"""#export
def cashsa(df, yr):
    id = '38'
    shortname = 'cashsa'
    fincol = id+'-'+shortname+year
    # Create the Numerator and Denominator
    numer = df[['BuyerFinan','CSA2010']].copy()
    numer['count'] = 1
    denom = numer.copy()
    # Filter Em
    numer = numer[ numer['BuyerFinan'].str.contains('.Cash.|.Cash|Cash.|Cash', regex=True, na=False) ]
    print("LENGTH AFTER FILTER: ", len(numer) )
    # Get Bcity Val
    bCityVal = numer.sum(numeric_only=True)['count']
    bCityValDenom = denom.sum(numeric_only=True)['count']
    # Group by CSA
    numer = numer.groupby('CSA2010').sum(numeric_only=True)
    denom = denom.groupby('CSA2010').sum(numeric_only=True)
    # Make sure ALL csas and BaltimoreCity are included and sorted.
    numer = csa.merge( numer, left_on='CSA2010', right_on='CSA2010', how='outer' )
    numer.drop( columns=['geometry', 'Shape__Length','Shape__Area', 'OBJECTID', 'tpop10'], inplace=True)
    denom = csa.merge( denom, left_on='CSA2010', right_on='CSA2010', how='outer' )
    denom.drop( columns=['geometry', 'Shape__Length','Shape__Area', 'OBJECTID', 'tpop10'], inplace=True)
    # Bcity is the sum of the community sums.
    # Incorrect Bcity Sum IFF Groupby keeps a 'False' row (index 56); index 55 is the appended Baltimore City row.
    numer.at[55,'count']= bCityVal
    denom.at[55,'count']= bCityValDenom
    # Perform the calculation
    numer[fincol] = numer['count'] / denom['count'] * 100
    compareYears = gpd.read_file("https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/"+shortname.capitalize()+"/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson");
    goback = 2 if year == '19' else 3 if year == '20' else 1
    prevYear = shortname + str( int(year) - goback )
    if prevYear in compareYears.columns:
        numer = numer.merge( compareYears[['CSA2010', prevYear]], left_on='CSA2010', right_on='CSA2010', how='outer' )
        numer['change'] = numer[id+'-'+shortname+year] - numer[ prevYear ]
        numer['percentChange'] = numer['change' ] / numer[ prevYear ] * 100
        numer['change'] = numer['change'].apply(lambda x: "{:.2f}".format(x) )
    print( 'Records Matching Query: ', numer.size / len(numer.columns) )
    return numer
resp = cashsa(df, year)
resp.head(2)
resp.tail(2)
cashsa(df,year).to_csv('38_cashsa_'+year+'.csv', index=False)
REOSA 39
# https://services1.arcgis.com/mVFRs7NF4iFitgbY/arcgis/rest/services/reosa/FeatureServer/layers
# https://bniajfi.org/indicators/Housing%20And%20Community%20Development/reosa/2017
lbl = 'Percentage of Residential Sales in Foreclosure (REO)'
TopicArea = 'Housing And Community Development'
YearsAvailable = '2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020'
long_Description = """
The portion of the homes and condominiums sold that were identified as
being owned by the bank (REO) out of all residential properties sold in a calendar year.
"""
original_sql = """
with numerator AS (
select (sum(
case
when foreclosur = $$Y$$
then 1
else 0
end)::numeric) as result, csa
from vital_signs.match_csas_and_bc_by_geom('housing.rbintelregion_2017', 'gid', 'the_geom') a
left join housing.rbintelregion_2017 b on a.gid = b.gid
group by csa
),
denominator AS (
select (sum(
case
when csa_present
then 1
else NULL
end)::numeric
) as result, csa
from vital_signs.match_csas_and_bc_by_geom('housing.rbintelregion_2017', 'gid', 'the_geom') a
left join housing.rbintelregion_2017 b on a.gid = b.gid
group by csa, the_pop
),
tbl AS (
select denominator.csa,(numerator.result / denominator.result)*(100::numeric) as result
from numerator left join denominator on numerator.csa = denominator.csa
) select * from tbl where 1 = 1 ORDER BY csa ASC;
"""#export
def reosa(df, yr):
    id = '39'
    shortname = 'reosa'
    fincol = id+'-'+shortname+year
    # Create the Numerator and Denominator
    numer = df[['Foreclosur','CSA2010']].copy()
    numer['count'] = 1
    denom = numer.copy()
    denom['count'] = 1
    # Filter Em
    numer = numer[ numer['Foreclosur'].str.contains('.Y.|.Y|Y.|Y', regex=True, na=False) ]
    print( numer['Foreclosur'].value_counts() )
    # Get Bcity Val
    bCityVal = numer.sum(numeric_only=True)['count']
    bCityValDenom = denom.sum(numeric_only=True)['count']
    # Group by CSA
    numer = numer.groupby('CSA2010').sum(numeric_only=True)
    denom = denom.groupby('CSA2010').sum(numeric_only=True)
    # Make sure ALL csas and BaltimoreCity are included and sorted.
    numer = csa.merge( numer, left_on='CSA2010', right_on='CSA2010', how='outer' )
    numer.drop( columns=['geometry', 'Shape__Length','Shape__Area', 'OBJECTID', 'tpop10'], inplace=True)
    denom = csa.merge( denom, left_on='CSA2010', right_on='CSA2010', how='outer' )
    denom.drop( columns=['geometry', 'Shape__Length','Shape__Area', 'OBJECTID', 'tpop10'], inplace=True)
    # Bcity is the sum of the community sums.
    # Incorrect Bcity Sum IFF Groupby keeps a 'False' row (index 56); index 55 is the appended Baltimore City row.
    numer.at[55,'count']= bCityVal
    denom.at[55,'count']= bCityValDenom
    # Perform the calculation
    numer['denomCount'] = denom['count']
    numer[fincol] = numer['count'] / numer['denomCount'] * 100
    compareYears = gpd.read_file("https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/"+shortname.capitalize()+"/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson");
    goback = 2 if year == '19' else 3 if year == '20' else 1
    prevYear = shortname + str( int(year) - goback )
    if prevYear in compareYears.columns:
        numer = numer.merge( compareYears[['CSA2010', prevYear]], left_on='CSA2010', right_on='CSA2010', how='outer' )
        numer['change'] = numer[id+'-'+shortname+year] - numer[ prevYear ]
        numer['percentChange'] = numer['change' ] / numer[ prevYear ] * 100
        numer['change'] = numer['change'].apply(lambda x: "{:.2f}".format(x) )
    print( 'Records Matching Query: ', numer.size / len(numer.columns) )
    return numer
resp = reosa(df, year)
resp.head()
resp.tail()
resp.to_csv('39_reosa_'+year+'.csv', index=False)
OLD_FNS
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.duplicated.html
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html
# https://stackoverflow.com/questions/41308763/python-pandas-df-duplicated-and-df-drop-duplicated-not-finding-all-duplicates
# NOTE: legacy exploratory code; it references older versions of the indicator
# functions (vsCashsa / vsReosa) and the pre-2018 field names (DaysOnMark, NewTrust1L).
import matplotlib.pyplot as plt  # needed for the plotting helpers below
def exploreDs(df, yr):
    def createIndicatorAndPlotChoropleth(ddf, txt1):
        fig, ax = plt.subplots(1, 1)
        csa.merge( vsDom(ddf, 'DOM_'+txt1+yr) , left_on='CSA2010', right_on='CSA2010' ).plot(column='dom', ax=ax, legend=True); plt.savefig('./output/img/DOM_Map_Of_the_'+txt1+yr+'.jpg')
        csa.merge( vsCashsa(ddf, 'Cashsa_'+txt1+yr) , left_on='CSA2010', right_on='CSA2010' ).plot(column='cashsa', ax=ax, legend=True); plt.savefig('./output/img/Cashsa_Map_Of_the_'+txt1+yr+'.jpg')
        csa.merge( vsReosa(ddf, 'Reosa_'+txt1+yr) , left_on='CSA2010', right_on='CSA2010' ).plot(column='reosa', ax=ax, legend=True); plt.savefig('./output/img/Reosa_Map_Of_the_'+txt1+yr+'.jpg')
    def plotAndSave(ddf, txt):
        fig, ax = plt.subplots(1, 1)
        base = csa.plot(color='white', edgecolor='black')
        ddf.plot(ax=base, marker='o', color='green', markersize=5);
        plt.savefig('./output/'+txt)
    print('!~~~~~~~~~~~~~~~~~~~~~!STARTING!!!!!! ',yr,' !~~~~~~~~~~~~~~~~~~~~~!')
    #
    # Drop All un-needed Columns
    df = df[['CSA2010', 'AddressLin', 'geometry', 'DaysOnMark', 'NewTrust1L', 'Foreclosur', 'SoldDate']]
    # Sort the Dataset by Address
    #
    df = df.sort_values(by=['AddressLin']).reset_index()
    print('Given: ', len(df), ' Records')
    # Run the Indicators
    createIndicatorAndPlotChoropleth(df, 'Untouched_Records')
    # Plot it on a CSA Map
    plotAndSave(df, 'Dot_Map_Of_the_Untouched_Records_'+yr+'.jpg')
    #
    # Drop the NON CSA Records
    # Save a copy of the Dropped Records?
    # - Nah. They won't affect our calculations and removing them adds clarity.
    #
    df.drop(df[df['CSA2010'] == 'false'].index, inplace=True)
    print('There are ', len(df), ' Records Remaining after Dropping Non-CSA Records')
    # Run the Indicators
    createIndicatorAndPlotChoropleth(df, 'Dropped_Non_CSA_Records')
    # Plot it on a CSA Map
    plotAndSave(df, 'Dot_Map_Of_the_Dropped_Non_CSA_Records_'+yr+'.jpg')
    #
    # Determines which duplicates (if any) to keep.
    # - first : Drop duplicates except for the first occurrence.
    # - last : Drop duplicates except for the last occurrence.
    # - False : Drop all duplicates.
    # Filter the dataset for duplicates in the AddressLin.
    #
    val1 = df.drop_duplicates(subset=['SoldDate', 'AddressLin'], keep='last').reset_index()
    print('There are', len(val1) , ' Records Remaining after Dropping all but the last Duplicate on SoldDate & AddressLin')
    # Run the Indicators
    createIndicatorAndPlotChoropleth(val1, 'Dropped_Non_CSA_Records_and_Deduped')
    # Plot it on a CSA Map
    plotAndSave(val1, 'Dot_Map_Of_the_Dropped_Non_CSA_Records_and_Deduped_'+yr+'.jpg')
    #
    # Save a copy of the data that was filtered out in a new dataset
    #
    val2 = df[df.duplicated(subset=['SoldDate', 'AddressLin'], keep=False)].reset_index()
    print('Having Removed This Many: ', len(val2))
    # Run the Indicators
    createIndicatorAndPlotChoropleth(val2, 'Dropped_Non_CSA_Records_and_Kept_Only_Duplicates')
    # Plot it on a CSA Map
    plotAndSave(val2, 'Dot_Map_Of_the_Dropped_Non_CSA_Records_and_Kept_Only_Duplicates_'+yr+'.jpg')
    return ( val1, val2, df )
# r177, val217, val317 = exploreDs(r17, '17')
# r188, val218, val318 = exploreDs(r18, '18')
# r189, val219, val319 = exploreDs(df, year)
def inspectReosaDenominator(df, yr):
    print( 'Unique Foreclosure Values', df.Foreclosur.unique() )
    print( 'Unique NewTrust1L Values', df.NewTrust1L.unique() )
    # Dedupe on 'AddressLin', 'SoldDate'
    print( "Original Dataset's Length: ", len(df))
    temp = df.drop_duplicates(subset=['AddressLin', 'SoldDate'], keep='last')
    print('Deduped Length: ', len(temp))
    print('Number of Records Removed: ', len(df) - len(temp))
    # Drop any NA AddressLin
    temp = temp.dropna(subset=['AddressLin'])
    print('Num Removed after also dropping NA Addresses: ', len(df) - len(temp))
    temp.head(1) # CSA2010 AddressLin SoldDate
    temp['count'] = 1
    v1 = temp.groupby(by=["CSA2010","Foreclosur"]).sum()
    v2 = temp.groupby(by=["CSA2010","DaysOnMark"]).median()
    v3 = temp.groupby(by=["CSA2010","NewTrust1L"]).sum() # .sort_values(by=['col1', 'col2'])
    v1.to_csv('reosa_Deduped'+yr+'_CSAs_Unique_Foreclosure_Counts.csv', index=False)
    v2.to_csv('reosa_Deduped'+yr+'_CSAs_Unique_DOM_Counts.csv', index=False)
    v3.to_csv('reosa_Deduped'+yr+'_CSAs_Unique_CASHSA_Counts.csv', index=False)
    return temp
# inspectReosaDenominator(df, year)
# Compare DS's for each CSA where Points exist but a Foreclosure Value does not.
def retrieveAndcleanRbIntel(filename, year):
    rbintel = gpd.read_file(filename);
    print(len(rbintel));
    # Convert to EPSG:4326
    rbintel = rbintel.to_crs(epsg=4326)
    rbintel.crs
    rbintel['x'] = rbintel.geometry.x
    rbintel['y'] = rbintel.geometry.y
    # Reference: All Points
    base = csa.plot(color='white', edgecolor='black')
    rbintel.plot(ax=base, marker='o', color='green', markersize=5);
    # Get CSA Labels for all Points.
    rbintelCSA = workWithGeometryData(
        method='ponp', df=rbintel, polys=csa, ptsCoordCol='geometry',
        polygonsCoordCol='geometry', polyColorCol=False, polygonsLabel='CSA2010'
    )
    rbintelCSA = rbintelCSA.drop('geometry',axis=1)
    rbintelCSA.to_csv('ponp_rbintel_'+year+'.csv', index=False)
    return rbintelCSA
Region 17 and Region 18 should have a similar number of records.
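A quick way to check that, assuming the 2017 vintage follows the same filename pattern used above (the exact filenames are an assumption; adjust as needed):
# Compare record counts across vintages; filenames are assumptions based on
# the pattern used earlier in this notebook.
r17 = gpd.read_file("RBIntel_2017_BaltRegion_newfields_CSA_City.shp")
r18 = gpd.read_file("RBIntel_2018_BaltRegion_newfields_CSA_City.shp")
print('2017 records:', len(r17), '| 2018 records:', len(r18))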