# default_exp fares
This colab and more can be found at https://github.com/BNIA/vitalsigns.
What's Inside?
The Guided Walkthrough
This notebook was made to create the following Housing Vital Signs Indicators:
Indicators Used
- ✅ 29 - salepr - (Fares) Median Price of Homes Sold
- ✅ 31 - shomes - (Fares) Number of Homes Sold
Datasets Used
- ✅ foreclosures.fares_201X (29-salepr, 31-shomes -> saledate primcatcod landusecod saleamount)
❌
# Two-digit vital-signs year suffix; selects the input shapefile
# ("FARES_20<year>_CSACity.shp") and names the output CSVs/columns.
year = '19'
Guided Walkthrough
SETUP Environment:
Import Modules
! pip install -U -q PyDrive
! pip install geopy
! pip install geopandas
! pip install geoplot
! pip install dataplay
! pip install matplotlib
! pip install psycopg2-binary! apt-get install build-dep python-psycopg2
! apt-get install libpq-dev
! apt-get install libspatialindex-dev!pip install rtree
!pip install dexplotfrom dataplay.geoms import workWithGeometryData%%capture
# These imports will handle everything
# NOTE(review): L47/L48 of the export fused two statements each without a
# separator (syntactically invalid); they are split back apart here.
import os
import sys
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
from geopandas import GeoDataFrame
import psycopg2
import pyproj
from pyproj import Proj, transform
# conda install -c conda-forge proj4
from shapely.geometry import Point
from shapely import wkb
from shapely.wkt import loads
# https://pypi.org/project/geopy/
from geopy.geocoders import Nominatim
# In case file is KML, enable support
import fiona
fiona.drvsupport.supported_drivers['kml'] = 'rw'
fiona.drvsupport.supported_drivers['KML'] = 'rw'
from IPython.display import clear_output
clear_output(wait=True)
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
Configure Environment
# This will just beautify the output
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)
from IPython.core.interactiveshell import InteractiveShell
# Echo every expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# pd.set_option('display.expand_frame_repr', False)
# pd.set_option('display.precision', 2)
# pd.reset_option('max_colwidth')
# Use the fully-qualified option key: the bare 'max_colwidth' alias is
# deprecated and rejected by current pandas.
pd.set_option('display.max_colwidth', 20)
# pd.reset_option('max_colwidth')
Prep Datasets
TPOP CSA and Baltimore
Get Baltimore
Click to toggle
# Get Baltimore boundary set: Tpop FeatureServer layer 0 (remote fetch).
csa = "https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Tpop/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson"
csa = gpd.read_file(csa)
csa.head(1)
# Get CSA: layer 1 of the same service; rename its City_1 column to the
# CSA2010 key used everywhere else, and give it a fresh OBJECTID.
# (The "Get CSA" heading had been fused onto the code line above by the
# notebook export; restored as a comment here.)
url2 = "https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Tpop/FeatureServer/1/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson"
csa2 = gpd.read_file(url2)
csa2['CSA2010'] = csa2['City_1']
csa2['OBJECTID'] = 56
csa2 = csa2.drop(columns=['City_1'])
csa2.head()
Append (but do not double-append) the Baltimore City record. We put it at the bottom of the df because when performing the ponp it returns only the last matching polygon's CSA label.
# Baltimore City (csa2) must come LAST: ponp keeps only the last matching
# polygon's label, so the city-wide polygon has to sit below the CSAs.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index reproduces append(...).reset_index(drop=True).
csa = pd.concat([csa, csa2], ignore_index=True)
csa.head(3)
csa.tail(3)
csa.head()
# Persist the combined boundaries without the ArcGIS bookkeeping columns.
csa.drop(columns=['Shape__Area', 'Shape__Length', 'OBJECTID'], axis=1).to_file("BCity_and_CSA.geojson", driver='GeoJSON')
Fares
import pandas as pd
import geopandas
# Load the year's FARES sales shapefile (already geocoded to CSA + city)
# and keep only records that matched a CSA or the city boundary.
original = gpd.read_file("FARES_20"+year+"_CSACity.shp", geometry='geometry')
original.columns
original.rename(columns={'CSA': 'CSA2010', 'BaltCity': 'InBaltimore'}, inplace=True)
# .copy() so the fillna assignment below writes to a real frame, not a view.
df = original[original['CSA2010'].notnull() | original['InBaltimore'].notnull()].copy()
print('After filtering records where a CSA or Baltimore geo-code match Exists')
print('All rows Before Filter: ', original.shape[0])  # rows, columns
print('# w BCity.isnull: ', df.InBaltimore.isnull().sum())
bmorow = df[df.CSA2010.isnull()].shape[0]
print('# w CSA2010.isnull: ', bmorow)
csarow = df[df.CSA2010.notnull()].shape[0]
print('# w CSA2010.notnull: ', csarow)
print('# rows After Filter: ', df.shape[0], '==', csarow, '+', bmorow, '==', csarow + bmorow)
# Records inside the city but outside every CSA get the city-wide label.
df.CSA2010 = df.CSA2010.fillna('Baltimore City')
fares = df.copy()
fares.head(1)
Not Used
# Convert Geom to Coords
# fares["x"] = fares.centroid.map(lambda p: p.x)
# fares["y"] = fares.centroid.map(lambda p: p.y)
fares['x'] = fares.geometry.x
fares['y'] = fares.geometry.y
# fares.head(5)
# Drop obviously bad geocodes: keep only points in the Baltimore
# vicinity (latitude above 38, longitude between -80 and -70).
fares = fares[fares.geometry.y > 38]
fares = fares[fares.geometry.x < -70]
fares = fares[fares.geometry.x > -80]
# Reference: All Points
base = csa.plot(color='white', edgecolor='black')
fares.plot(ax=base, marker='o', color='green', markersize=5)
fares = fares[['SALEDATE', 'SALEAMOUNT', 'LANDUSECOD', 'geometry']]
fares.head()
# Get CSA Labels for all Points (point-in-polygon via dataplay).
faresCsa = workWithGeometryData(
    method='ponp', df=fares, polys=csa, ptsCoordCol='geometry',
    polygonsCoordCol='geometry', polygonsLabel='CSA2010'
)
faresCsa = faresCsa.drop('geometry', axis=1)
faresCsa.to_csv('ponp_fares.csv', index=False)
faresCsa.head(1)
Indicators
# NOTE(review): this overwrites the ponp result above with a plain copy
# of `fares` -- confirm that is intentional before relying on CSA labels.
# (Three statements were fused onto one line by the export; split here.)
faresCsa = fares.copy()
faresCsa.dtypes
faresCsa.LANDUSECOD.unique()
29 - salepr (Fares)
The landusecod column was discontinued a few years ago.
Q: The original sql queries say we need to filter on for ['HIGH RISE CONDO', 'CONDOMINIUM', 'RESIDENTIAL (NEC)', 'MID RISE CONDO', 'MULTI FAMILY DWELLING', 'MULTI FAMILY 10 UNITS LESS', 'SFR', 'TOWNHOUSE/ROWHOUSE'] but the column is missing
A: as of 3/30/2021 I (Carlos) officially declare that we will not be using landusecod in our query and are going to be using PROPERTYIN on fields 'CONDOMINIUM', 'SINGLE FAMILY'.
# Pipe-delimited landusecod values from the legacy SQL filter (kept for
# reference only; the pandas query now filters on PROPERTYIN instead).
oldquerilandusecodfilter = "HIGH RISE CONDO|CONDOMINIUM|RESIDENTIAL (NEC)|MID RISE CONDO|MULTI FAMILY DWELLING|MULTI FAMILY 10 UNITS LESS|SFR|TOWNHOUSE/ROWHOUSE"
# salepr - Median Price of Homes Sold
# https://services1.arcgis.com/mVFRs7NF4iFitgbY/arcgis/rest/services/salepr/FeatureServer/layers
# Numerator: Fares
# Denominator: None
# NOTE: the bare annotation below evaluates the string but binds no name;
# it serves purely as in-notebook documentation of the indicator.
long_Description: """
The median home sales price is the middle value of the prices for which homes are sold (both market and private transactions)
within a calendar year. The median value is used as opposed to the average so that both extremely high and extremely low prices
do not distort the prices for which homes are sold. This measure does not take into account the assessed value of a property.
"""
# Legacy Postgres implementation of salepr (reference only). The pandas
# version in this notebook reproduces its filter, substituting PROPERTYIN
# for the retired landusecod column.
salepr_SQL = """
SELECT fullbounds.csa, sQuery.Result
FROM boundaries.csa2010 as fullbounds
LEFT JOIN (SELECT bounds.csa AS Boundary, median(Tables.saleamount::numeric(10,2))::numeric(10,2) as Result
FROM housing.fares_2016 AS Tables
JOIN boundaries.csa2010 AS bounds
ON ST_Contains(bounds.the_geom, Tables.the_geom)
where (landusecod LIKE 'HIGH RISE CONDO' OR
landusecod LIKE 'CONDOMINIUM' OR
landusecod LIKE 'RESIDENTIAL (NEC)' OR
landusecod LIKE 'MID RISE CONDO' OR
landusecod LIKE 'MULTI FAMILY DWELLING' OR
landusecod LIKE 'MULTI FAMILY 10 UNITS LESS' OR
landusecod LIKE 'SFR' OR
landusecod LIKE 'TOWNHOUSE/ROWHOUSE'
) AND
( primcatcod LIKE 'ARMS LENGTH') AND
( saledate between '20160101' and '20161231') AND
( saleamount > 1000 )
GROUP BY Boundary
ORDER BY Boundary) as sQuery
ON fullbounds.csa = sQuery.Boundary
ORDER BY fullbounds.csa
"""
# landusecod No longer being used
# Plain-English paraphrase of the SQL above (not executable code).
salepr_translation = """
Select median saleamount from CSA Where (
( landusecod LIKE ['HIGH RISE CONDO', 'CONDOMINIUM', 'RESIDENTIAL (NEC)', 'MID RISE CONDO',
'MULTI FAMILY DWELLING', 'MULTI FAMILY 10 UNITS LESS', 'SFR', 'TOWNHOUSE/ROWHOUSE'] ) &
( primcatcod LIKE 'ARMS LENGTH') &
( '20160101' < saledate > '20161231') &
( saleamount > 1000) )
"""#export
# Copy the Data
faresCsa = fares.copy()
faresCsa['SALEDATE'] = pd.to_datetime(faresCsa['SALEDATE'], format='%Y%m%d')
# Query the Data: arms-length sales of condos / single-family homes above
# $1000 within the indicator year (PROPERTYIN replaces landusecod).
salepr = faresCsa[
    (faresCsa['PROPERTYIN'].str.contains('CONDOMINIUM|SINGLE FAMILY', regex=True))
    & (faresCsa['SALEDATE'] >= '20'+year+'-01-01')
    & (faresCsa['SALEDATE'] <= '20'+year+'-12-31')
    & (faresCsa['SALEAMOUNT'] > 1000)
    & (faresCsa['PRIMARYCAT'] == 'ARMS LENGTH')
].copy()  # .copy() so the inplace rename/drop below act on a frame, not a view
# Prep and Save the Filtered Records
salepr.rename(columns={'SALEAMOUNT': '29-salepr'+year}, inplace=True)
salepr.drop(columns=['geometry', 'PRIMARYCAT', 'PROPERTYIN', 'LANDUSECOD', 'SALEDATE'], inplace=True)
salepr.to_csv('fares_filtered_'+year+'.csv')
# *Special*: add a city-wide record BEFORE grouping so 'Baltimore City'
# carries the median over ALL filtered sales. DataFrame.append was removed
# in pandas 2.0; pd.concat is the supported equivalent.
salepr = pd.concat(
    [salepr,
     pd.DataFrame([{'CSA2010': 'Baltimore City',
                    '29-salepr'+year: salepr['29-salepr'+year].median()}])],
    ignore_index=True)
salepr = salepr.groupby('CSA2010').median(numeric_only=True)
# Make sure ALL csas and BaltimoreCity are included and sorted.
salepr = csa.merge(salepr, left_on='CSA2010', right_on='CSA2010', how='outer')
salepr.drop(columns=['OBJECTID', 'Shape__Length', 'Shape__Area', 'geometry'], inplace=True)
salepr = salepr[['CSA2010', '29-salepr'+year]]
display(salepr.head(2))
salepr.tail(2)
salepr.to_csv('29-salepr'+year+'.csv')
# & (faresCsa['PRIMCATCOD'] == 'ARMS LENGTH' )
# faresCsa['LANDUSECOD'].str.contains('HIGH RISE CONDO|CONDOMINIUM|RESIDENTIAL (NEC)|MID RISE CONDO|MULTI FAMILY DWELLING|MULTI FAMILY 10 UNITS LESS|SFR|TOWNHOUSE/ROWHOUSE', regex=False)
# NOTE(review): the export swallowed the next two statements into the
# comment above; restored as live code -- confirm against the notebook.
faresCsa = faresCsa.astype({'SALEAMOUNT': 'int32'})
pd.options.display.float_format = '{:.2f}'.format
# Diagnostics: inspect Greater Roland Park/Poplar Hill sales records
# using the same filter as the salepr indicator.
t = faresCsa[
    (faresCsa.CSA2010 == 'Greater Roland Park/Poplar Hill') &
    (faresCsa['PROPERTYIN'].str.contains('CONDOMINIUM|SINGLE FAMILY', regex=True)) &
    (faresCsa['SALEDATE'] >= '20'+year+'-01-01') &
    (faresCsa['SALEDATE'] <= '20'+year+'-12-31') &
    (faresCsa['SALEAMOUNT'] > 1000) &
    (faresCsa['PRIMARYCAT'] == 'ARMS LENGTH')
][['CSA2010', 'SALEAMOUNT', 'PROPERTYIN']].sort_values(by='SALEAMOUNT')
t.to_csv('GRP salepr records.csv')
t.plot.bar(x='CSA2010', y='SALEAMOUNT', rot=0)
t.head(20)
faresCsa.PROPERTYIN.unique()
pd.options.display.float_format = '{:.2f}'.format
faresCsa[
    (faresCsa.CSA2010 == 'Greater Roland Park/Poplar Hill') &
    (faresCsa.PRIMARYCAT == 'ARMS LENGTH')
].plot.bar(x='CSA2010', y='SALEAMOUNT', rot=0)
pd.options.display.float_format = '{:.2f}'.format
faresCsa[
    (faresCsa.CSA2010 == 'Greater Roland Park/Poplar Hill') &
    (faresCsa['PROPERTYIN'].str.contains('CONDOMINIUM|SINGLE FAMILY', regex=True))
].head(10).plot.bar(x='CSA2010', y='SALEAMOUNT', rot=0)
faresCsa.PRIMARYCAT.unique()
faresCsa[faresCsa['PRIMARYCAT'] == 'ARMS LENGTH']['CSA2010'].value_counts()
salepr.head(22)
# faresCsa[ (faresCSA['CSA2010']=='Midtown') & (faresCsa['SALEAMOUNT'] > 1000) & (faresCsa['PRIMARYCAT'] == 'ARMS LENGTH' ) & (faresCsa['PROPERTYIN'].str.contains('CONDOMINIUM|SINGLE FAMILY', regex=True) ) ].head(3)
# faresCsa[ (faresCSA['CSA2010']=='Midtown') & (faresCsa['SALEAMOUNT'] > 1000) ].head(3)
31 - shomes - (Fares)
# shomes - Number of Homes Sold
# https://services1.arcgis.com/mVFRs7NF4iFitgbY/arcgis/rest/services/shomes/FeatureServer/layers
# Numerator: housing.fares_201X
# Denominator: None
# NOTE: the bare annotation below evaluates the string but binds no name;
# it serves purely as in-notebook documentation of the indicator.
long_Description: """The total number of residential properties sold in a calendar year."""
# Legacy Postgres implementation of shomes (reference only). The pandas
# version in this notebook reproduces its filter, substituting PROPERTYIN
# for the retired landusecod column.
_SQL = """
with tbl AS (
select ( sum( case
when (
landusecod = 'HIGH RISE CONDO' OR
landusecod = 'CONDOMINIUM' OR
landusecod = 'RESIDENTIAL (NEC)' OR
landusecod = 'MID RISE CONDO' OR
landusecod = 'MULTI FAMILY DWELLING' OR
landusecod = 'MULTI FAMILY 10 UNITS LESS' OR
landusecod = 'SFR' OR
landusecod = 'TOWNHOUSE/ROWHOUSE'
) AND
(primcatcod = 'ARMS LENGTH') AND
(saledate between '20160101' and '20161231') AND
(saleamount > 1000) then 1 else 0 end)::numeric ) as result, csa
from vital_signs.match_csas_and_bc_by_geom('housing.fares_2016', 'gid', 'the_geom') a
left join housing.fares_2016 b on a.gid = b.gid group by csa
)
update vital_signs.data
set shomes = result from tbl where data.csa = tbl.csa and data_year = '2016';
"""
# Plain-English paraphrase of the SQL above (not executable code).
_translation = """
Sum records where
landusecod in ['HIGH RISE CONDO', 'CONDOMINIUM', 'RESIDENTIAL (NEC)', 'MID RISE CONDO', 'MULTI FAMILY DWELLING', 'MULTI FAMILY 10 UNITS LESS', 'SFR', 'TOWNHOUSE/ROWHOUSE']
AND (primcatcod = 'ARMS LENGTH')
AND (saledate between '20160101' and '20161231')
AND (saleamount > 1000)
"""#export
# Copy the Data
faresCsa = fares.copy()
faresCsa['SALEDATE'] = pd.to_datetime(faresCsa['SALEDATE'], format='%Y%m%d')
# Query the Data: identical filter to salepr (29) -- arms-length condo /
# single-family sales above $1000 within the indicator year.
shomes = faresCsa[
    (faresCsa['PROPERTYIN'].str.contains('CONDOMINIUM|SINGLE FAMILY', regex=True))
    & (faresCsa['SALEDATE'] >= '20'+year+'-01-01')
    & (faresCsa['SALEDATE'] <= '20'+year+'-12-31')
    & (faresCsa['SALEAMOUNT'] > 1000)
    & (faresCsa['PRIMARYCAT'] == 'ARMS LENGTH')
].copy()  # .copy() so the column assignment below does not warn on a view
# Prep and Save the Filtered Records: each surviving row counts as one sale.
shomes['31-shomes'+year] = 1
# shomes.rename(columns={ 'SALEAMOUNT':'29-shomes'+year}, inplace=True)
# shomes.drop(columns=['geometry', 'PRIMARYCAT', 'PROPERTYIN', 'LANDUSECOD', 'SALEDATE', 'SALEAMOUNT'], inplace=True)
shomes = shomes[['CSA2010', '31-shomes'+year]]
shomes.to_csv('fares_filtered_'+year+'.csv')
# *Special*: # UPDATE HERE AND THEN GROUP
# shomes = shomes.append({'CSA2010': 'Baltimore City', '31-shomes'+year : shomes['31-shomes'+year].sum() } , ignore_index=True)
shomes = shomes.groupby('CSA2010').sum(numeric_only=True)
# Make sure ALL csas and BaltimoreCity are included and sorted.
shomes = csa.merge(shomes, left_on='CSA2010', right_on='CSA2010', how='outer')
shomes.drop(columns=['OBJECTID', 'Shape__Length', 'Shape__Area', 'geometry'], inplace=True)
# *SPECIAL* Update the baltimore CSA: city total = sum over all CSA rows.
# .loc sets a whole row; .at requires both a row AND a column label, so the
# original `shomes.at[55] = ...` fails on current pandas.
# NOTE(review): index 55 is assumed to be the Baltimore City row after the
# merge (55 CSAs + city appended last) -- confirm against the csa frame.
shomes.loc[55] = shomes.sum(numeric_only=True)
shomes.at[55, 'CSA2010'] = 'Baltimore City'
display(shomes.head(2))
shomes.tail(2)
shomes.to_csv('31-shomes'+year+'.csv')
Merge and Save Both