# default_exp bpd
58 - shoot - Shooting Rate per 1,000 Residents
Convert Shooting
WITH tbl AS (
  SELECT (SUM(CASE WHEN csa_present THEN 1 ELSE 0 END)::numeric * 1000) / the_pop AS result, csa
  FROM vital_signs.match_csas_and_bc_by_geom('crime.part1_2017', 'gid', 'the_geom') a
  LEFT JOIN crime.part1_2017 b
    ON a.gid = b.gid
  GROUP BY csa, the_pop
)
SELECT * FROM tbl WHERE 1 = 1
Not Used: this SQL was scrapped; the indicator was processed manually.
Todo:
- Refactor queries so they do not need a point-in-polygon step
- Wrap as a function
What's Inside? (a minimal sketch of these steps follows this list):
- Retrieve ACS Datasets
- Retrieve Crime Datasets
- Filter Down Columns
- Append CSA Labels with a Point-in-Polygon Operation
- One-Hot Encode Variables
- Aggregate Points along CSAs
- Create Indicators
- Visualize These Aggregates
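At a glance, the whole pipeline reduces to a few pandas/geopandas moves. A minimal sketch, assuming a points frame carrying a Desc_ column and the CSA polygons loaded below; note the real work in this notebook uses dataplay's ponp helper, and gpd.sjoin with predicate= requires geopandas >= 0.10:
import pandas as pd
import geopandas as gpd

def sketch_pipeline(points, csas):
    # 1. Point-in-polygon: attach each point's CSA2010 label via a spatial join.
    labeled = gpd.sjoin(points, csas[['CSA2010', 'geometry']], how='left', predicate='within')
    # 2. One-hot encode the crime description column (yields Desc__HOMICIDE, etc.).
    dummies = pd.get_dummies(labeled['Desc_'], prefix='Desc_')
    # 3. Aggregate point counts along CSAs.
    return pd.concat([labeled[['CSA2010']], dummies], axis=1).groupby('CSA2010').sum(numeric_only=True)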
Indicators Used
- ✅ 50 - crime - (Part1, Tpop) Part 1 Crime Rate per 1,000 Residents
- ✅ 52 - vio - (Part1, Tpop) Violent Crime Rate per 1,000 Residents
- ✅ 53 - prop - (Part1, Tpop) Property Crime Rate per 1,000 Residents
- ✅ 59 - gunhom - (Part1, Tpop) Number of Gun-Related Homicides per 1,000 Residents
Datasets Used
- ✅ Crime_Part_1 - (50-crime, 52-vio, 53-prop, 59-gunhom -> Descriptio, Weapon)
- ✅ Baltvac - (50-crime, 52-vio, 53-prop, 59-gunhom -> Tpop)
SETUP Environment:
Import Modules
! pip install -U -q PyDrive
! pip install geopy
! pip install geopandas
! pip install geoplot
! pip install dataplay
! pip install matplotlib
! pip install psycopg2-binary
! apt-get build-dep python-psycopg2
! apt-get install libpq-dev
! apt-get install libspatialindex-dev
! pip install rtree
! pip install dexplot
from dataplay.geoms import workWithGeometryData
%%capture
# These imports will handle everything
import os
import sys
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import geopandas as gpd
from geopandas import GeoDataFrame
import psycopg2
import pyproj
from pyproj import Proj, transform
# conda install -c conda-forge proj4
from shapely.geometry import Point
from shapely import wkb
from shapely.wkt import loads
# https://pypi.org/project/geopy/
from geopy.geocoders import Nominatim
# In case file is KML, enable support
import fiona
fiona.drvsupport.supported_drivers['kml'] = 'rw'
fiona.drvsupport.supported_drivers['KML'] = 'rw'
from IPython.display import clear_output
clear_output(wait=True)
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
Configure Environment
# This will just beautify the output
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# pd.set_option('display.expand_frame_repr', False)
# pd.set_option('display.precision', 2)
# pd.reset_option('max_colwidth')
pd.set_option('max_colwidth', 20)
# pd.reset_option('max_colwidth')
Prep Datasets
TPOP CSA and Baltimore
Get CSAs
csa = "https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Tpop/FeatureServer/0/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson"
csa = gpd.read_file(csa);
csa.head(1)
Get Baltimore City
url2 = "https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Tpop/FeatureServer/1/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson"
csa2 = gpd.read_file(url2);
csa2['CSA2010'] = csa2['City_1']
csa2['OBJECTID'] = 56
csa2 = csa2.drop(columns=['City_1'])
csa2.head()
Append, do not prepend, the Baltimore City record: we put it at the bottom of the dataframe because the point-in-polygon (ponp) operation returns only the last matching polygon's CSA label.
# csa = pd.concat([csa2, csa], ignore_index=True)
csa = csa.append(csa2).reset_index(drop=True)
csa.head(3)
csa.tail(3)
csa.head()
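A quick sanity check that the city-wide record (OBJECTID 56, assigned above) really sits at the bottom, since the ponp helper keeps the last matching polygon's label:
# The city polygon must be the final row so CSA labels win for points inside both.
assert csa.iloc[-1]['OBJECTID'] == 56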
2018 P1 Crime
Append CSA's name to each record using a Point in Polygons operation
cd p1crime
ls
Load it
import geopandas as gpd
original18 = gpd.read_file('Part1_2018.shp')
original18 = original18.to_crs(epsg=4326)
Drop Columns
original18.columns
original18.Desc_.unique()
I'm just interested in 2018 vs 2019 for the description (crime type) right now, plus the indicator values for the CSAs and city.
We are only going to be looking at this from an aggregated point of view.
While our columns are all very important and each deserves inspection for quality assurance, we are going to look past that part for this exercise.
original18 = original18.drop(columns=['Longitude','Latitude', 'Location1', 'Total_Inci', 'CrimeDate', 'CrimeTime', 'Location', 'Neighborho', 'Post', 'District', 'vri_name1', 'InOut', 'Premise', 'CrimeCode'])
original18.head(1)
Let's take a peek
original18.plot(column='Desc_')
Eeesh. Let's filter out points not near Baltimore for the sake of visibility.
gdf18 = original18.copy()
filteredOut18 = original18.copy()
This works
# gdf18 = original18.copy()
# gdf18 = gdf18[ (gdf18.geometry.y < 39.378) == (gdf18.geometry.y > 39.215) ]
# gdf18 = gdf18[ (gdf18.geometry.x < -76.52) == (gdf18.geometry.x > -76.71) ]
# gdf18.plot(column='CrimeCode')
# print( "Originally had {}, and now we have {}, removing {}".format( len(gdf18), len(original18), (len(gdf18) - len(original18) ) ) )
But this next bit essentially does the same thing and more.
Append CSA labels with a point-in-polygon operation.
# Get CSA Labels for all Points.
gdf18 = workWithGeometryData(
method='ponp', df=gdf18, polys=csa, ptsCoordCol='geometry',
polygonsCoordCol='geometry', polygonsLabel='CSA2010'
)
gdf18.to_csv('ponp_gdf18.csv', index=False)
Output:
Total Points: 48143.0
Total Points in Polygons: 47748
Prcnt Points in Polygons: 0.9917952765718796
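For comparison, roughly the same labeling and coverage numbers can be produced with a plain geopandas spatial join. This is a sketch, not the dataplay implementation:
# Left join: each point keeps a CSA2010 label when it falls within a polygon, NaN otherwise.
labeled = gpd.sjoin(original18, csa[['CSA2010', 'geometry']], how='left', predicate='within')
inside = labeled['CSA2010'].notna().sum()
print('Total Points:', len(labeled))
print('Total Points in Polygons:', inside)
print('Prcnt Points in Polygons:', inside / len(labeled))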
gdf18[gdf18.CSA2010 != 'false'].plot(column='CSA2010')
Dummy encode the description column.
ls
cd p1crime
gdf18New = pd.read_csv('ponp_gdf18.csv')
gdf18New = gdf18New.drop('geometry', axis=1)
# gdf18 = gdf18.drop(['CrimeCode', 'Premise', 'Weapon', 'InOut'], axis=1)
gdf18New.columns
gdf18New.CSA2010.unique()
gdf18New
df18 = pd.get_dummies( gdf18New.loc[:, gdf18New.columns != 'CSA2010'] )
df18.columns
df18['CSA2010'] = gdf18New.CSA2010
df18['gunhom'] = df18.apply(lambda x: 1 if x['Desc__HOMICIDE'] + x['Weapon_FIREARM']== 2 else 0, axis=1 )
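The sum-equals-2 trick works because the one-hot columns are 0/1 flags, so adding them implements a logical AND. An equivalent vectorized form of the same flag, as a sketch:
# 1 + 1 == 2 only when the record is both a homicide and firearm-involved.
df18['gunhom'] = ((df18['Desc__HOMICIDE'] == 1) & (df18['Weapon_FIREARM'] == 1)).astype(int)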
df18.head(1)
dft18 = df18.groupby('CSA2010').sum(numeric_only=True)
dft18.head(1)
dft18 = dft18.reset_index()
dft18.to_csv('aggregated18CrimeDesc.csv')
dft18 = pd.read_csv('aggregated18CrimeDesc.csv')
dft18.drop('Unnamed: 0', inplace=True, axis=1)
dft18 = dft18.merge( csa[ ['CSA2010', 'geometry'] ], left_on='CSA2010', right_on='CSA2010' )
import geopandas as gpd
dft18 = gpd.GeoDataFrame( dft18, geometry='geometry' )
dft18.head()
dft18.plot(column='Desc__AGG. ASSAULT')
2019 P1 Crime
Append CSA's name to each record using a Point in Polygons operation
ls
Load it
import geopandas as gpd
original19 = gpd.read_file('Part1_2019.shp')
original19 = original19.to_crs(epsg=4326)
Drop Columns
original19.columns
So. We can see that different column names are used.
original19.Descriptio.unique()
original19['Desc_'] = original19.Descriptio
I'm just interested in 2018 vs 2019 for the description (crime type) right now, plus the indicator values for the CSAs and city.
We are only going to be looking at this from an aggregated point of view.
While our columns are all very important and each deserves inspection for quality assurance, we are going to look past that part for this exercise.
original19 = original19.drop(columns=['CrimeDate', 'CrimeTime', 'CrimeCode', 'Location', 'Descriptio',
'Inside_Out', 'Post', 'District', 'Neighborho', 'Longitude',
'Latitude', 'Location_1', 'Premise', 'vri_name1', 'Total_Inci' ])
original19.head(1)
Let's take a peek
original19.plot(column='Desc_')
Eeesh. Let's filter out points not near Baltimore for the sake of visibility.
gdf19 = original19.copy()
filteredOut19 = original19.copy()
Append CSA label w a Points in Polygons operation.
# Get CSA Labels for all Points.
gdf19 = workWithGeometryData(
method='ponp', df=gdf19, polys=csa, ptsCoordCol='geometry',
polygonsCoordCol='geometry', polygonsLabel='CSA2010'
)
gdf19.to_csv('ponp_gdf19.csv', index=False)
Output:
Total Points: 37166.0
Total Points in Polygons: 37018
Prcnt Points in Polygons: 0.9960178657913147
gdf19[gdf19.CSA2010 != 'false'].plot(column='CSA2010')
Or Use Pre-Geocoded Point in Polygon
year = '20'
import geopandas as gpd
original = gpd.read_file('Part1_20'+year+'_CSACity.shp')
All the functions in here use different column names
original.rename(columns={ 'CSA':'CSA2010', 'BaltCity':'InBaltimore'}, inplace=True)
original.head(1)
# original.groupby('CSA2010').sum(numeric_only=True)
And convert the CRS
original = original.to_crs(epsg=4326)
original.crs
Let's see if we have the columns we need
original.columns
Original Dataset
original.plot()
Remove these records because they fall in neither a CSA nor Baltimore City
removeThese = original[ original['CSA2010'].isnull() & original['InBaltimore'].isnull() ]
removeThese.plot()
Keep These
df = original[ original['CSA2010'].notnull() | original['InBaltimore'].notnull() ]
df.plot()
print('After filtering records where a CSA or Baltimore geo-code match exists')
print( 'All rows Before Filter: ', original.shape[0] ) # rows, columns
print( '# w BCity.isnull: ', df.InBaltimore.isnull().sum() )
bmorow = df[ df.CSA2010.isnull() ].shape[0]
print( '# w CSA2010.isnull: ', bmorow )
csarow = df[ df.CSA2010.notnull() ].shape[0]
print( '# w CSA2010.notnull: ', csarow )
print( '# rows After Filter: ', df.shape[0],'==',csarow,'+',bmorow,'==', csarow + bmorow) # add baltimore city
df.CSA2010 = df.CSA2010.fillna('Baltimore City')
df.head(1)
df['Desc_'] = df.Descriptio
# FOR YR 20
df = df.drop(columns=['CrimeCode', 'Location', 'Descriptio',
'Inside_Out', 'Post', 'District', 'Neighborho', 'Longitude',
'Latitude', 'Premise', 'Total_Inci' ])
df.head(1)
# FOR YR 19, 18
df = df.drop(columns=['CrimeDate', 'CrimeTime', 'CrimeCode', 'Location', 'Descriptio',
'Inside_Out', 'Post', 'District', 'Neighborho', 'Longitude',
'Latitude', 'Location_1', 'Premise', 'vri_name1', 'Total_Inci' ])
df.head(1)
df.tail(3)
df.to_csv('ponp_gdf'+year+'.csv', index=False)
NOT USED
# start the count
df['pointsinpolygon'] = 1
df = df.groupby('CSA2010').sum(numeric_only=True)
# Make sure ALL csas and BaltimoreCity are included. among other things
df = csa.merge( df, left_on='CSA2010', right_on='CSA2010', how='outer' )
# Update the baltimore CSA.
df.at[55,'pointsinpolygon'] = df['pointsinpolygon'].sum()
df.tail(2)
from shapely import wkt
df = pd.read_csv('ponp_gdf'+year+'.csv')
df['geometry'] = df['geometry'].apply(wkt.loads)
df = gpd.GeoDataFrame(df, crs='epsg:4326')
df.plot()
Dummy encode the description column.
gdfNew = pd.read_csv('ponp_gdf'+year+'.csv')
# 20
gdfNew = gdfNew.drop(['geometry', 'RowID_', 'CrimeDateT', 'GeoLocatio', 'InBaltimore', 'VRIName'], axis=1)
# 19 gdfNew = gdfNew.drop(['geometry'], axis=1)
# gdf18 = gdf18.drop(['CrimeCode', 'Premise', 'Weapon', 'InOut'], axis=1)
gdfNew.columns
gdfNew.CSA2010.unique()
gdfNew
df = pd.get_dummies( gdfNew.loc[:, gdfNew.columns != 'CSA2010'] )
df['CSA2010'] = gdfNew.CSA2010
df['gunhom'] = df.apply(lambda x: 1 if x['Desc__HOMICIDE'] + x['Weapon_FIREARM']== 2 else 0, axis=1 )
df.head(1)
dft = df.groupby('CSA2010').sum(numeric_only=True)
dft.head(3)
dft = dft.reset_index()
dft.to_csv('aggregated'+year+'CrimeDesc.csv')
dft = pd.read_csv('aggregated'+year+'CrimeDesc.csv')
dft.head(1)
dft = dft.merge( csa[ ['CSA2010', 'geometry'] ], left_on='CSA2010', right_on='CSA2010' )
dft.head()
dft.tail()
Create Indicators
Unique Descriptions
"""
dft = pd.read_csv('aggregated'+year+'CrimeDesc.csv')
# Remove the 'False' Records
reapp = dft.loc[55]
dft = dft.drop([55])
dft = dft.append(dft.sum(numeric_only=True), ignore_index=True)
# Reappend the False records
dft = dft.append(reapp)
dft = dft.reset_index()
dft = dft.drop(columns=['Unnamed: 0', 'index'])
dft.loc[55, 'CSA2010'] = "Baltimore City"
dft.tail()"""df = pd.read_csv('aggregated'+year+'CrimeDesc.csv')
dft = df.groupby('CSA2010').sum(numeric_only=True)
# Make sure ALL csas and BaltimoreCity are included. among other things
dft = csa.merge( dft, left_on='CSA2010', right_on='CSA2010', how='outer' )
# Update the baltimore CSA.
dft.loc[55, dft.sum(numeric_only=True).index] = dft.sum(numeric_only=True)
dft.at[55, 'CSA2010'] = 'Baltimore City'
display( dft.head(2) )
dft.tail(2)
dft.columns
dft.tail()
# Aggregate by CSA
# Group by CSA so that they may be operated on
groupedCounts = dft.groupby('CSA2010')
# Aggregate Numeric Values by Sum
groupedCounts = groupedCounts.sum(numeric_only=True)
groupedCounts.tail()
Crime - 50
We don't have a 'Desc_ SHOOTING' column, so is this accurate?
When the weapon is FIREARM and the description is HOMICIDE, does that not mean it was a shooting?
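One way to probe that question against the raw 2018 points, as a hedged check; it assumes gdf18New still holds the pre-dummy Desc_ and Weapon columns:
# Compare firearm homicides against explicit SHOOTING records, where the latter exist.
print('Firearm homicides:', ((gdf18New['Desc_'] == 'HOMICIDE') & (gdf18New['Weapon'] == 'FIREARM')).sum())
print('SHOOTING records:', (gdf18New['Desc_'] == 'SHOOTING').sum())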
original_SQL_crime16 = """
--crime
--/* Indicator Number 50 */
with tbl AS (
  select (sum( case when (descriptio != 'ARSON' AND descriptio != 'COMMON ASSAULT' AND descriptio != 'SHOOTING')
    then 1 else 0 end)::numeric * 1000 )/the_pop as result, csa
  from vital_signs.match_csas_and_bc_by_geom('crime.part1_2016', 'gid', 'the_geom') a
  left join crime.part1_2016 b on a.gid = b.gid
  group by csa, the_pop
)
update vital_signs.data set crime = result from tbl where data.csa = tbl.csa and data_year = '2016';
"""
outline_crime16 = """
descriptio != 'ARSON' AND descriptio != 'COMMON ASSAULT' AND descriptio != 'SHOOTING'
count * 1000 )/the_pop
"""
#export
dft['crime'+year] = (
dft['Desc__AGG. ASSAULT'] +
dft['Desc__AUTO THEFT'] +
dft['Desc__BURGLARY'] +
dft['Desc__HOMICIDE'] +
dft['Desc__LARCENY'] +
dft['Desc__LARCENY FROM AUTO'] +
dft['Desc__RAPE'] +
dft['Desc__ROBBERY - CARJACKING'] +
dft['Desc__ROBBERY - COMMERCIAL'] +
dft['Desc__ROBBERY - RESIDENCE'] +
dft['Desc__ROBBERY - STREET']
) * 1000 / csa['tpop10']
# Just checking I got all but three. 14 desc cols - 11 being used = 3 not used. All's good.
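All four indicators share one shape: a filtered count scaled to a rate per 1,000 residents. A small hedged helper capturing that pattern; the names here are illustrative, and the cells below spell each indicator out column by column:
def rate_per_1000(frame, desc_cols, pop):
    # Sum the selected one-hot description counts, then scale per 1,000 residents.
    return frame[desc_cols].sum(axis=1) * 1000 / pop

# e.g. dft['crime'+year] = rate_per_1000(dft, part1_cols, csa['tpop10'])
# where part1_cols is a hypothetical list of the Desc__ columns used below.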
Vio - 52
original_SQL_viol16 = """
--/* Indicator Number 52 */
with tbl AS (
  select (sum( case when
    (descriptio LIKE 'ROBBERY - CARJACKING' OR
     descriptio LIKE 'ROBBERY - COMMERCIAL' OR
     descriptio LIKE 'ROBBERY - RESIDENCE' OR
     descriptio LIKE 'ROBBERY - STREET' OR
     descriptio LIKE 'AGG. ASSAULT' OR
     descriptio LIKE 'RAPE' OR
     descriptio LIKE 'HOMICIDE') then 1 else 0 end)::numeric * 1000 )/the_pop as result, csa
  from vital_signs.match_csas_and_bc_by_geom('crime.part1_2016', 'gid', 'the_geom') a
  left join crime.part1_2016 b on a.gid = b.gid
  group by csa, the_pop
)
update vital_signs.data set viol = result from tbl where data.csa = tbl.csa and data_year = '2016';
"""
outline_viol16 = """
descriptio LIKE 'ROBBERY - CARJACKING' OR descriptio LIKE 'ROBBERY - COMMERCIAL' OR descriptio LIKE 'ROBBERY - RESIDENCE' OR descriptio LIKE 'ROBBERY - STREET' OR descriptio LIKE 'AGG. ASSAULT' OR descriptio LIKE 'RAPE' OR descriptio LIKE 'HOMICIDE'
count * 1000 )/the_pop
"""
#export
dft['viol'+year] = (dft['Desc__ROBBERY - CARJACKING'] +
dft['Desc__ROBBERY - COMMERCIAL'] +
dft['Desc__ROBBERY - RESIDENCE'] +
dft['Desc__ROBBERY - STREET'] +
dft['Desc__AGG. ASSAULT'] +
dft['Desc__RAPE'] +
dft['Desc__HOMICIDE'] ) * 1000 / csa['tpop10']
Prop - 53
original_SQL_prop16 = """
--/* Indicator Number 53 */
with tbl AS (
  select (sum( case when (
    descriptio = 'LARCENY' OR
    descriptio = 'LARCENY FROM AUTO' OR
    descriptio = 'BURGLARY' OR
    descriptio = 'AUTO THEFT')
    then 1 else 0 end)::numeric * 1000 )/the_pop as result, csa
  from vital_signs.match_csas_and_bc_by_geom('crime.part1_2016', 'gid', 'the_geom') a
  left join crime.part1_2016 b on a.gid = b.gid
  group by csa, the_pop
)
update vital_signs.data set prop = result from tbl where data.csa = tbl.csa and data_year = '2016';
"""
outline_prop16 = """
descriptio = 'LARCENY' OR descriptio = 'LARCENY FROM AUTO' OR descriptio = 'BURGLARY' OR descriptio = 'AUTO THEFT'
count * 1000 )/the_pop
"""
#export
dft['prop'+year] = (dft['Desc__LARCENY'] +
dft['Desc__LARCENY FROM AUTO'] +
dft['Desc__BURGLARY'] +
dft['Desc__AUTO THEFT'] ) * 1000 / csa['tpop10']
Gunhom - 59
original_SQL_gunhom16 = """
--/* Indicator Number 59 */
with tbl AS (
  select (sum( case when (descriptio = 'HOMICIDE' AND weapon = 'FIREARM') then 1 else 0 end)::numeric * 1000 )/the_pop as result, csa
  from vital_signs.match_csas_and_bc_by_geom('crime.part1_2016', 'gid', 'the_geom') a
  left join crime.part1_2016 b on a.gid = b.gid
  group by csa, the_pop
)
update vital_signs.data set gunhom = result from tbl where data.csa = tbl.csa and data_year = '2016';
"""
outline_gunhom18 = """
descriptio = 'HOMICIDE' AND weapon = 'FIREARM'
count * 1000 )/the_pop
"""
desc = """homicide <WEAPON> weapon = firearm ... Shooting comes from a different dataset though."""
#export
dft['gunhom'+year] = (dft['gunhom'] ) * 1000 / csa['tpop10']
No firearm description column survives the aggregation, and (as the commented lines below note) there is no SHOOTING description column either; the 'gunhom' flag created in the pre-processing stage handles this.
dft.columns
# dft18['gunhom18'] = (dft18['Desc__HOMICIDE'] + dft18['Desc__SHOOTING']) * 1000 / csa['tpop10']
# dft19['gunhom19'] = (dft19['Desc__HOMICIDE'] + dft19['Desc__SHOOTING']) * 1000 / csa['tpop10']
# WE DON'T HAVE A SHOOTING COLUMN
Wrap Up
dft = dft.drop(columns=['geometry'])
dft.head()
dft.to_csv('p1crime_'+year+'1_indicators_and_desc_aggregates.csv')
dft19 = dft19.add_prefix('19_')
dft18 = dft18.add_prefix('18_')
dftFinal = dft19.merge( dft18, left_on='19_CSA2010', right_on='18_CSA2010' )
dftFinal.columns
dftFinal.tail()
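With both years prefixed and merged, a year-over-year delta is one subtraction away. A sketch; the prefixed indicator names such as '19_viol19' are assumptions built from the add_prefix calls above:
# Hypothetical prefixed column names from add_prefix('19_') / add_prefix('18_').
dftFinal['viol_change'] = dftFinal['19_viol19'] - dftFinal['18_viol18']
dftFinal[['19_CSA2010', 'viol_change']].sort_values('viol_change').tail()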