Todo:
This colab and more can be found at https://github.com/BNIA/vitalsigns.
The following example pulls point geodata from a Postgres database. We will pull the Postgres point data in two ways.
This notebook was made to create Vital Signs Indicators from an Info-USA geographic dataset.
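A minimal sketch of those two approaches (the connection string and credentials are hypothetical; the table, geometry, and coordinate column names follow the SQL queries later in this notebook):

import pandas as pd
import geopandas as gpd
from sqlalchemy import create_engine

# Hypothetical connection string; substitute real credentials.
engine = create_engine('postgresql://user:password@localhost:5432/vitalsigns')

# Manner 1: read the PostGIS geometry column directly into a GeoDataFrame.
points = gpd.read_postgis('SELECT * FROM economy.infousa_2016', engine, geom_col='the_geom')

# Manner 2: read plain latitude/longitude columns and construct the geometry ourselves.
df = pd.read_sql('SELECT * FROM economy.infousa_2016', engine)
points2 = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.longitude, df.latitude), crs='EPSG:4326')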
Get Baltimore
Get CSA
url2 = "https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Tpop/FeatureServer/1/query?where=1%3D1&outFields=*&returnGeometry=true&f=pgeojson"
csa2 = gpd.read_file(url2)
csa2['CSA2010'] = csa2['City_1']
csa2['OBJECTID'] = 56
csa2 = csa2.drop(columns=['City_1'])
csa2.head()

Append (or do not append) the Baltimore City record. We put it at the bottom of the dataframe because, when performing the point-in-polygon operation, only the last matching CSA label is returned.
# csa = pd.concat([csa2, csa], ignore_index=True)
csa = pd.concat([csa, csa2]).reset_index(drop=True)  # DataFrame.append was removed in pandas 2.0
csa.head(3)
csa.tail(3)
csa.head()

All
permitsAll = permits

# Reference: All Points
base = csa.plot(color='white', edgecolor='black')
permitsAll.plot(ax=base, marker='o', color='green', markersize=5)

permits = permitsAll

# y < 0
permitsLessThanZero = permits[ permits.geometry.y < 0 ]
print('Y < 0: ', permitsLessThanZero.size, '\n')
permitsLessThanZero.plot()

# y > 0
permitsGreaterThanZero = permits[ permits.geometry.y > 0 ]
print('Y > 0: ', permitsGreaterThanZero.size, '\n')
permitsGreaterThanZero.plot()

# 0 < y < 38
permitsBelow38 = permits[ permits.geometry.y < 38 ]
permitsBelow38 = permitsBelow38[ permitsBelow38.geometry.y > 0 ]
print('0 < Y < 38: ', permitsBelow38.size, '\n')
permitsBelow38.plot()

# y > 38
permitsAbove38 = permits[ permits.geometry.y > 38 ]
print('Y > 38: ', permitsAbove38.size, '\n')
permitsAbove38.plot()

If you are using GeoPandas, direct imports only work with GeoJSON and shapefiles.
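For any other format, a minimal sketch (the CSV filename is hypothetical; the 'X'/'Y' column names follow the shapefile columns used below) is to load the file with pandas first, then build the GeoDataFrame from its coordinate columns:

import pandas as pd
import geopandas as gpd

# Hypothetical CSV with 'X' (longitude) and 'Y' (latitude) columns.
df = pd.read_csv('permits.csv')
gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['X'], df['Y']), crs='EPSG:4326')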
gdf = gpd.read_file("InfoUSA_2018.shp")
gdf.head(5)

# Drop the last two characters (the trailing '.0') from the NAICS code and cast to integer.
gdf['prim_naics_short'] = gdf.prim_naics.astype(str).str[:-2].astype(np.int64)

# All but 'geometry', prim_naics, prim_sic, 'empl_size', 'X', 'Y':
# ['coname', 'empl_rng', 'sales_vol', 'sales_rng', 'psic_dsc', 'scnd_sic1', 'scnd_dsc1', 'scnd_sic2', 'scnd_dsc2',
#  'cr_a_score', 'cr_n_score', 'headqtr', 'first_year', 'sq_foot', 'firm_indv', 'fleetsize', 'specialty1',
#  'specialty2', 'pnaics_dsc', 'acct_exp', 'ad_exp', 'offsup_exp', 'pay_exp', 'rent_exp', 'tech_exp', 'tele_exp',
#  'ins_exp', 'legal_exp', 'pckg_exp', 'pirnt_exp', 'prof_exp', 'templbrexp', 'util_exp']

gdf.columns

# A narrower drop list, kept here (commented out) for reference:
"""
gdf = gdf.drop(['Status', 'Score', 'Match_type', 'Side', 'Match_addr', 'ARC_Street', 'recorddate', 'recordobs',
                'recordobs_', 'recordobs1', 'source', 'address', 'city', 'state', 'zipcode', 'mc_route',
                'md_barcode', 'loc_addr', 'loc_city', 'loc_state', 'loc_zip', 'locbarcode', 'loc_route', 'county',
                'phn_nbr', 'web_addr', 'last_name', 'first_name', 'ctct_title', 'ctct_prof', 'ctct_gen', 'headqtr',
                'ofc_size', 'sq_foot', 'pub_pvt', 'ind_code', 'yellowpage', 'metro_area', 'infousa_id', 'latitude',
                'longitude', 'match_code'], axis=1)
"""

gdf = gdf.drop(['Status', 'Score', 'Match_type', 'Side', 'Match_addr', 'ARC_Street', 'recorddate', 'recordobs',
                'recordobs_', 'recordobs1', 'source', 'address', 'city', 'state', 'zipcode', 'mc_route',
                'md_barcode', 'loc_addr', 'loc_city', 'loc_state', 'loc_zip', 'locbarcode', 'loc_route', 'county',
                'phn_nbr', 'web_addr', 'last_name', 'first_name', 'ctct_title', 'ctct_prof', 'ctct_gen',
                'sales_vol', 'sales_rng', 'scnd_sic1', 'scnd_dsc1', 'scnd_sic2', 'scnd_dsc2', 'cr_a_score',
                'cr_n_score', 'headqtr', 'ofc_size', 'sq_foot', 'firm_indv', 'pub_pvt', 'fleetsize', 'specialty1',
                'specialty2', 'ind_code', 'yellowpage', 'metro_area', 'infousa_id', 'latitude', 'longitude',
                'match_code', 'acct_exp', 'ad_exp', 'offsup_exp', 'pay_exp', 'rent_exp', 'tech_exp', 'tele_exp',
                'ins_exp', 'legal_exp', 'pckg_exp', 'pirnt_exp', 'prof_exp', 'templbrexp', 'util_exp'], axis=1)
gdf.columns

# Keep only rows with a valid Y coordinate, then drop the raw coordinate columns.
gdf = gdf[ gdf['Y'] > 0 ]
gdf = gdf.drop(['X', 'Y'], axis=1)

# Convert to EPSG:4326
gdf = gdf.to_crs(epsg=4326)
gdf.crs

# Reference: All Points
base = csa.plot(color='white', edgecolor='black')
gdf.plot(ax=base, marker='o', color='green', markersize=5)

# Number of records
gdf.head()

# Get CSA labels for all points.
infoUsaCsa = getPolygonOnPoints(gdf, csa, 'geometry', 'geometry', 'CSA2010')
infoUsaCsa = infoUsaCsa.drop('geometry', axis=1)
infoUsaCsa.head(1)

# Get counts of points in polygons. This function returns CSAs with a tally of the points within each.
infoUsaCsaTotals = getPointsInPolygons(gdf, csa, 'geometry', 'geometry')
infoUsaCsaTotals = infoUsaCsaTotals.drop('geometry', axis=1)
infoUsaCsaTotals = pd.concat([infoUsaCsaTotals,
                              pd.DataFrame([{'CSA2010': 'Baltimore City', 'tpop10': 620961,
                                             'pointsinpolygon': infoUsaCsaTotals['pointsinpolygon'].sum()}])],
                             ignore_index=True)
infoUsaCsaTotals['numbus'] = infoUsaCsaTotals['pointsinpolygon']
infoUsaCsaTotals = infoUsaCsaTotals.drop('pointsinpolygon', axis=1)
infoUsaCsaTotals.tail()

infoUsaCsaTotals.to_csv('numbus18.csv', index=False)

https://towardsdatascience.com/interactive-controls-for-jupyter-notebooks-f5c94829aee6
# Points
@interact
def show_articles_more_than(column=gdfleft.columns):
    return gdfleft.plot(column=column, legend=True)

# Heatmap
@interact
def show_articles_more_than(column=gdfleft.columns):
    return map_points(gdfleft.head(500), lat_col='Y', lon_col='X', popup=column,
                      zoom_start=11, plot_points=False, pt_radius=15,
                      draw_heatmap=column, heat_map_weights_col=None,
                      heat_map_weights_normalize=True, heat_map_radius=15)

# MarkerCluster.ipynb
# https://github.com/python-visualization/folium/blob/master/examples/MarkerCluster.ipynb
from folium.plugins import MarkerCluster

m = folium.Map(location=[39.28759453969165, -76.61278931706487], zoom_start=12)
marker_cluster = MarkerCluster().add_to(m)
stations = gdfleft.head(1000)['geometry'].apply(
    lambda p: folium.Marker(location=[p.y, p.x], popup='Add popup text here.', icon=None).add_to(marker_cluster)
)
m

# Interact with specification of arguments
@interact
def show_articles_more_than(column=country_peripheries.columns):  # gdfleft.columns
    return gpd.overlay(csa, gdfleft.head(), how='difference').plot(alpha=0.5, edgecolor='k',
                                                                   column=column, cmap='magma', legend=True)

To simulate data sampled at different times, we randomly sample data for n_periods rows. Note that the geodata and the randomly sampled data are linked through the feature_id, which is the index of the GeoDataFrame.
periods = 10
datetime_index = pd.date_range('2010', periods=periods, freq='Y')
dt_index_epochs = (datetime_index.astype(int)).astype('U10')
datetime_index

# Style each boundary with randomness.
styledata = {}  # map from feature id to its style dataframe
for country in gdf.index:
    df = pd.DataFrame(
        {'color': np.random.normal(size=periods),
         'opacity': [1, 2, 3, 4, 5, 6, 7, 8, 9, 1]},
        index=dt_index_epochs)
    df = df.cumsum()
    styledata[country] = df
    ax = df.plot()

df.head()

We see that we generated two series of data for each country; one for color and one for opacity. Let's plot them to see what they look like.
max_color, min_color, max_opacity, min_opacity = 0, 0, 0, 0
for country, data in styledata.items():
    max_color = max(max_color, data['color'].max())
    min_color = min(min_color, data['color'].min())
    max_opacity = max(max_opacity, data['opacity'].max())
    min_opacity = min(min_opacity, data['opacity'].min())

linear.PuRd_09.scale(min_color, max_color)

We want to map the column named color to a hex color. To do this we use a normal colormap. To create the colormap, we calculate the maximum and minimum values over all the timeseries. We also need the max/min of the opacity column, so that we can map that column into the range [0, 1].
from branca.colormap import linear

cmap = linear.PuRd_09.scale(min_color, max_color)

def norm(x):
    return (x - x.min()) / (x.max() - x.min())

for country, data in styledata.items():
    data['color'] = data['color'].apply(cmap)
    data['opacity'] = norm(data['opacity'])

styledata

Finally we use pd.DataFrame.to_dict() to convert each dataframe into a dictionary, and place each of these in a map from country id to data.
from folium.plugins import TimeSliderChoropleth

m = folium.Map([39.28759453969165, -76.61278931706487], zoom_start=12)
g = TimeSliderChoropleth(
    gdf.to_json(),
    styledict={str(country): data.to_dict(orient='index') for country, data in styledata.items()}
).add_to(m)
m

Generate a GeoSeries containing points.
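A sketch of that step, following the GeoPandas geometric-manipulations example the next two paragraphs refer to (the bounding-box values and point count are illustrative):

import numpy as np
from shapely.geometry import Point
import geopandas as gpd

# Scatter 2000 random points across an illustrative bounding box.
xmin, xmax, ymin, ymax = 900000, 1080000, 120000, 280000
xc = (xmax - xmin) * np.random.random(2000) + xmin
yc = (ymax - ymin) * np.random.random(2000) + ymin
pts = gpd.GeoSeries([Point(x, y) for x, y in zip(xc, yc)])

# Buffer each point and union the circles into a single MultiPolygon, mp.
mp = pts.buffer(2000).unary_union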
Note that this can be simplified a bit, since geometry is available as an attribute on a GeoDataFrame, and the intersection and difference methods are implemented with the “&” and “-” operators, respectively. For example, the latter could have been expressed simply as boros.geometry - mp.
It’s easy to do things like calculate the fractional area in each borough that are in the holes:
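A sketch of that calculation, assuming boros is the borough GeoDataFrame from the same GeoPandas example and mp is the MultiPolygon of circles built above:

# Difference via the "-" operator, then hole area as a fraction of each borough's area.
holes = boros.geometry - mp
holes.area / boros.geometry.area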
gdf.head()
csa.head()
gdfleft.head()
gdfleft[ gdfleft.coname == 'Us Army Corps Of Engineers' ]
gdf = gdfleft.copy()

csaUrl = "https://services1.arcgis.com/mVFRs7NF4iFitgbY/ArcGIS/rest/services/Tpop/FeatureServer/0/query?where=1%3D1&objectIds=&time=&geometry=&geometryType=esriGeometryEnvelope&inSR=&spatialRel=esriSpatialRelIntersects&resultType=none&distance=0.0&units=esriSRUnit_Meter&returnGeodetic=false&outFields=tpop10%2C+CSA2010&returnGeometry=true&returnCentroid=false&featureEncoding=esriDefault&multipatchOption=xyFootprint&maxAllowableOffset=&geometryPrecision=&outSR=&datumTransformation=&applyVCSProjection=false&returnIdsOnly=false&returnUniqueIdsOnly=false&returnCountOnly=false&returnExtentOnly=false&returnQueryGeometry=false&returnDistinctValues=false&cacheHint=false&orderByFields=&groupByFieldsForStatistics=&outStatistics=&having=&resultOffset=&resultRecordCount=&returnZ=false&returnM=false&returnExceededLimitFeatures=true&quantizationParameters=&sqlFormat=none&f=pgeojson&token="
csa = gpd.read_file(csaUrl)
csa.head()

The rate of businesses (both for-profit and non-profit) that are directly related to arts and culture per 1,000 residents. Arts-related businesses are defined as belonging to industries that allow for the consumption and enjoyment of arts and culture.
The following industries are identified by their primary NAICS code: music, literary, and visual arts-related retail/supplies (451140, 451211, 451220); art dealers (453920); libraries (519120); motion picture and film (521310, 532230); art schools (611610); performing arts (711110, 711120, 711130, 711190); independent artists, writers, and performers (711510); museums (712110); historical sites (712120); and zoos, gardens and nature parks (712130, 712190).
The following industries are identified by their primary SIC codes: designers (152106); art publishers (274101), music, literary, and visual arts-related retail/supplies (393101, 519202, 573608, 573609, 593201, 594201, 594205, 594501, 594520, 594601, 599965, 769969); art galleries, dealers, and consultants (599969, 599988, 599989); photography (722121); calligraphers (733607); embroidery (738942); theatres (783201, 792207); theatrical support (792211, 792212); musical and live entertainment (792903, 792905, 792906, 792908, 792917, 792918, 792927); parks (799951); art and music instruction (804958, 829915, 829919); libraries (823111); museums (841201); arts organizations (841202); zoos (842201); writers (899903); visual artists (899907, 899912); art restoring (899908); and music arrangers and composers (899921).
naicCodes = [451140, 451211, 451220, 453920, 519120, 521310, 532230, 611610, 711110, 711120, 711130, 711190,
             711510, 712110, 712120, 712130, 712190]
sicCodes = [152106, 274101, 393101, 519202, 573608, 573609, 593201, 594201, 594205, 594501, 594520, 594601,
            599965, 769969, 599969, 599988, 599989, 722121, 733607, 738942, 783201, 792207, 792211, 792212,
            792903, 792905, 792906, 792908, 792917, 792918, 792927, 799951, 804958, 829915, 829919, 823111,
            841201, 841202, 842201, 899903, 899907, 899912, 899908, 899921]

artbus = infoUsaCsa[ (infoUsaCsa['prim_naics_short'].isin(naicCodes)) | (infoUsaCsa.prim_sic.isin(sicCodes)) ]

# Aggregate numeric values by sum.
artbus = artbus[ ['CSA2010'] ]
artbus['artbusCount'] = 1
artbus = artbus.groupby('CSA2010').sum(numeric_only=True).reset_index()
artbus = artbus.merge( csa[ ['CSA2010', 'tpop10'] ], left_on='CSA2010', right_on='CSA2010' )
artbus = pd.concat([artbus, pd.DataFrame([{'CSA2010': 'Baltimore City', 'tpop10': 620961,
                                           'artbusCount': artbus['artbusCount'].sum()}])], ignore_index=True)

# Create the indicator.
artbus['artbus'] = artbus['artbusCount'] * 1000 / artbus['tpop10']
artbus.to_csv('artbus18.csv', index=False)
artbus.tail()

import json

def artbus(bounds, df):
    """ 131 - artbus
    with tbl AS (
        select (sum(case when ((prim_naics::text like any (select * from vital_signs.artbus_naics_vals)
                                or prim_sic::text like any (select * from vital_signs.artbus_sic_vals))
                               and coname != 'Us Army Corps Of Engineers')
                        then 1 else 0 end)::numeric * 1000) / the_pop as result, csa
        from vital_signs.match_csas_and_bc_by_geom('economy.infousa_2016', 'gid', 'the_geom') a
        left join economy.infousa_2016 b on a.gid = b.gid
        group by csa, the_pop
    )
    update vital_signs.data set artbus = result from tbl
    where data.csa = tbl.csa and data_year = '2016';
    """
    # Filter rows
    # https://www.naics.com/code-search/?sictrms=art
    # https://www.naics.com/code-search/?naicstrms=art
    naicCodes = [451140, 451211, 451220, 453920, 519120, 521310, 532230, 611610, 711110, 711120, 711130,
                 711190, 711510, 712110, 712120, 712130, 712190]
    sicCodes = [152106, 274101, 393101, 519202, 573608, 573609, 593201, 594201, 594205, 594501, 594520,
                594601, 599965, 769969, 599969, 599988, 599989, 722121, 733607, 738942, 783201, 792207,
                792211, 792212, 792903, 792905, 792906, 792908, 792917, 792918, 792927, 799951, 804958,
                829915, 829919, 823111, 841201, 841202, 842201, 899903, 899907, 899912, 899908, 899921]

    # Sum rows: count a business when its primary NAICS or SIC code matches.
    # (prim_naics like any [...] or prim_sic like any [...]) and coname != 'Us Army Corps Of Engineers'
    df['prim_naics_short'] = df.prim_naics.astype(str).str[:-2].astype(np.int64)
    filtered_df = df[ (df['prim_naics_short'].isin(naicCodes)) | (df.prim_sic.isin(sicCodes)) ]  # & (df.coname != 'Us Army Corps Of Engineers')

    # Points in polygons.
    csasWithCounts = getPointsInPolygons(filtered_df, bounds, 'geometry', 'geometry')

    # Group by CSA so the counts can be operated on, then aggregate numeric values by sum.
    groupedCounts = csasWithCounts.groupby('CSA2010')
    groupedCounts = groupedCounts.sum(numeric_only=True).reset_index()
    # groupedCounts = groupedCounts.merge(bounds, left_on='CSA2010', right_on='CSA2010')
    print(groupedCounts.columns)
    groupedCounts['numOfBusinesses'] = groupedCounts['pointsinpolygon']
    groupedCounts = groupedCounts.drop(['pointsinpolygon'], axis=1)
    # groupedCounts = pd.concat([groupedCounts, pd.DataFrame([{'CSA2010': 'Baltimore City', 'tpop10': 620961,
    #                            'numOfBusinesses': groupedCounts['numOfBusinesses'].sum()}])], ignore_index=True)
    print({'CSA2010': 'Baltimore City', 'tpop10': 620961,
           'numOfBusinesses': groupedCounts['numOfBusinesses'].sum()})
    groupedCounts['artbus'] = groupedCounts['numOfBusinesses'] * 1000 / groupedCounts['tpop10']
    return groupedCounts

artbus_vals = artbus(csaComms, gdfleft)
artbus_vals.to_csv('artbus18.csv')
artbus_vals

The rate of businesses (both for-profit and non-profit) that are in the creative economy per 1,000 residents. The creative economy is defined as industries that use and support artistic and cultural skillsets to attract and generate capital, knowledge, and information. Arts-based businesses are included in the creative economy. In addition to the industries included in the rate of arts-based businesses indicator, the following industries are identified by their primary NAICS code: Textiles (313220); Commercial Printing (323111, 323113); Book Printers and Publishers (323117, 511130); Print Media (451212, 511110, 511120, 511199, 519110); Motion Picture & Video Production (512110); Music Publishers (512230); Sound Recording (512240); Radio (515112); Architecture (541310, 541320); Interior Design (541410); Graphic Design (541430); Advertising (541810, 541890); and Photography (541921, 541922).
In addition to the industries included in the rate of arts-based businesses indicator, the following industries are identified by their primary SIC code: Print Media (271101, 271102, 271198, 272101, 272102, 272104, 273101, 273198, 596302, 599401); Publishers (273298, 274104, 274105, 874205); Printers (275202, 275902, 275998); Bookbinders (278902); Radio (483201); Television (483301, 484101, 792205, 824911); Textiles (513122, 594904); Advertising (519917, 731101, 731115, 731305, 731999); Fashion Designers (569901, 594408); Photography (722101, 722113, 722120, 733501, 738401); Graphic Design (733603); Commercial Artists (733604); Website Design (737311); General Media (738301); Interior Design (738902); Restoration (764112); Landscape Design (781030); Motion Picture and Video Support (781205, 781211, 781901); Architecture (871202, 871207, 871209, 874892); and Business Writers (899902).
The empl_size column type differs across the annual InfoUSA tables:
2015 -> empl_size integer
2016 -> empl_size character varying(254)
2017 -> empl_size bigint
Convert Column StringToInt
CREATE OR REPLACE FUNCTION pc_chartoint(chartoconvert character varying)
RETURNS integer AS $BODY$
    SELECT CASE WHEN trim($1) SIMILAR TO '[0-9]+'
                THEN CAST(trim($1) AS integer)
                ELSE NULL END;
$BODY$ LANGUAGE 'sql' IMMUTABLE STRICT;
ALTER TABLE economy.infousa_2016 ALTER COLUMN empl_size TYPE integer USING pc_chartoint(empl_size);
import json
from IPython.display import clear_output

clear_output(wait=True)

def cebus(bounds, df, pop):
    """ 201 - cebusXX
    with tbl AS (
        select (sum(case when ((prim_naics::text like any (select * from vital_signs.cebus_naics_vals)
                                or prim_sic::text like any (select * from vital_signs.cebus_sic_vals))
                               and coname != 'Us Army Corps Of Engineers')
                        then 1 else 0 end)::numeric * 1000) / the_pop as result, csa
        from vital_signs.match_csas_and_bc_by_geom('economy.infousa_2016', 'gid', 'the_geom') a
        left join economy.infousa_2016 b on a.gid = b.gid
        group by csa, the_pop
    )
    update vital_signs.data set cebus = result from tbl
    where data.csa = tbl.csa and data_year = '2016';
    """
    # Filter rows
    # https://www.naics.com/code-search/?sictrms=art
    # https://www.naics.com/code-search/?naicstrms=art
    naicCodes = [323111, 323113, 451140, 451211, 451212, 453920, 511110, 511120, 511130, 511199, 512110,
                 519110, 519120, 541310, 541320, 541410, 541430, 541810, 541890, 541921, 541922, 611610,
                 711110, 711130, 711190, 711510, 712110, 712120, 712130, 712190, 313220, 323117, 511130,
                 512230, 512240, 515112]
    sicCodes = [271101, 271102, 271198, 272101, 272102, 272104, 273101, 273198, 596302, 599401, 273298,
                274104, 274105, 874205, 275202, 275902, 275998, 278902, 483201, 483301, 484101, 792205,
                824911, 513122, 594904, 519917, 731101, 731115, 731305, 731999, 569901, 594408, 722101,
                722113, 722120, 733501, 738401, 733603, 733604, 737311, 738301, 738902, 764112, 781030,
                781205, 781211, 781901, 871202, 871207, 871209, 874892, 899902, 451220, 521310, 532230,
                711120]
    fromArtbusNaicsNotFoundInCebusNaics = [451220, 521310, 532230, 711120]

    # Sum rows: count a business when its primary NAICS or SIC code matches.
    df['prim_naics_short'] = df.prim_naics.astype(str).str[:-2].astype(np.int64)
    filtered_df = df[ (df.prim_naics_short.isin(naicCodes)) | (df.prim_sic.isin(sicCodes)) ]  # & (df.coname != 'Us Army Corps Of Engineers')
    filtered_df.to_csv('cebus_points.csv')

    # Points in polygons.
    csasWithCounts = getPointsInPolygons(filtered_df, bounds, 'geometry', 'geometry')

    # Group by CSA so the counts can be operated on, then aggregate numeric values by sum.
    groupedCounts = csasWithCounts.groupby('CSA2010').sum(numeric_only=True).reset_index()
    groupedCounts = groupedCounts.merge(pop, left_on='CSA2010', right_on='CSA2010')
    # 'pointsinpolygon' is assumed here to be the count column returned by getPointsInPolygons (see above);
    # the original used 'number of points'.
    groupedCounts['countOfBusinesses'] = groupedCounts['pointsinpolygon']
    groupedCounts['cebus'] = groupedCounts['pointsinpolygon'] * 1000 / groupedCounts['tpop10']
    groupedCounts = groupedCounts.drop(['pointsinpolygon'], axis=1)
    groupedCounts.to_csv('cebus.csv', index=False)
    return groupedCounts

naicCodes = [323111, 323113, 451140, 451211, 451212, 453920, 511110, 511120, 511130, 511199, 512110,
             519110, 519120, 541310, 541320, 541410, 541430, 541810, 541890, 541921, 541922, 611610,
             711110, 711130, 711190, 711510, 712110, 712120, 712130, 712190, 313220, 323117, 511130,
             512230, 512240, 515112]
sicCodes = [271101, 271102, 271198, 272101, 272102, 272104, 273101, 273198, 596302, 599401, 273298,
            274104, 274105, 874205, 275202, 275902, 275998, 278902, 483201, 483301, 484101, 792205,
            824911, 513122, 594904, 519917, 731101, 731115, 731305, 731999, 569901, 594408, 722101,
            722113, 722120, 733501, 738401, 733603, 733604, 737311, 738301, 738902, 764112, 781030,
            781205, 781211, 781901, 871202, 871207, 871209, 874892, 899902]

# Use a distinct name so we don't shadow the cebus() function defined above.
cebusDf = infoUsaCsa[ (infoUsaCsa['prim_naics_short'].isin(naicCodes)) | (infoUsaCsa.prim_sic.isin(sicCodes)) ]
print( cebusDf.size / len(cebusDf.columns) )

# Aggregate numeric values by sum.
cebusDf = cebusDf[ ['CSA2010'] ]
cebusDf['cebusCount'] = 1
cebusDf = cebusDf.groupby('CSA2010').sum(numeric_only=True).reset_index()
cebusDf = cebusDf.merge( csa[ ['CSA2010', 'tpop10'] ], left_on='CSA2010', right_on='CSA2010' )
cebusDf = pd.concat([cebusDf, pd.DataFrame([{'CSA2010': 'Baltimore City', 'tpop10': 620961,
                                             'cebusCount': cebusDf['cebusCount'].sum()}])], ignore_index=True)

# Create the indicator.
cebusDf['cebus'] = cebusDf['cebusCount'] * 1000 / cebusDf['tpop10']
cebusDf.to_csv('cebus18.csv', index=False)
cebusDf.tail()

population = pd.read_csv('population.csv')
csaComms = csa[ ['CSA2010', 'geometry'] ].copy()
# csaComms = csaComms.drop('tpop10', axis=1)
cebus_vals = cebus(csaComms, gdfleft, population)
cebus_vals

The first_year column type differs across the annual tables:
2016 -> first_year character varying(254)
2017 -> first_year bigint
Convert Column IntToString
CREATE OR REPLACE FUNCTION pc_inttochar(chartoconvert bigint)
RETURNS character AS $BODY$
    SELECT CASE WHEN 1 = 1
                THEN CAST($1 AS character(254))
                ELSE NULL END;
$BODY$ LANGUAGE 'sql' IMMUTABLE STRICT;
ALTER TABLE economy.infousa_2017 ALTER COLUMN first_year TYPE character varying(254) USING pc_inttochar(first_year);
biz4_SQL = """ 152 - biz4_XX
with numerator as (
    select sum(case when first_year LIKE '2016' OR first_year LIKE '2015' OR first_year LIKE '2014'
                         OR first_year LIKE '2013' then 1 else 0 end)::numeric as result, csa
    from vital_signs.match_csas_and_bc_by_geom('economy.infousa_2016', 'gid', 'the_geom') a
    left join economy.infousa_2016 b on a.gid = b.gid
    group by csa
), denominator AS (
    select (sum(case when csa_present then 1 else NULL end)::numeric) as result, csa
    from vital_signs.match_csas_and_bc_by_geom('economy.infousa_2016', 'gid', 'the_geom') a
    left join economy.infousa_2016 b on a.gid = b.gid
    group by csa
), tbl AS (
    select vital_signs.div_zero(numerator.result, denominator.result) * (100::numeric) as result, numerator.csa
    from numerator left join denominator on numerator.csa = denominator.csa
)
update vital_signs.data set biz4_ = result from tbl
where data.csa = tbl.csa and data_year = '2016';

with numerator as (
    select sum(case when first_year LIKE '2017' OR first_year LIKE '2016' OR first_year LIKE '2015'
                         OR first_year LIKE '2014' OR first_year LIKE '2013' then 1 else 0 end)::numeric as result, csa
    from vital_signs.match_csas_and_bc_by_geom('economy.infousa_2017', 'gid', 'the_geom') a
    left join economy.infousa_2017 b on a.gid = b.gid
    group by csa
), denominator AS (
    select (sum(case when csa_present then 1 else NULL end)::numeric) as result, csa
    from vital_signs.match_csas_and_bc_by_geom('economy.infousa_2017', 'gid', 'the_geom') a
    left join economy.infousa_2017 b on a.gid = b.gid
    group by csa
), tbl AS (
    select vital_signs.div_zero(numerator.result, denominator.result) * (100::numeric) as result, numerator.csa
    from numerator left join denominator on numerator.csa = denominator.csa
)
select * from tbl where 1 = 1 ORDER BY csa ASC;
"""

Translation = """ """

#export
# 152 - biz4XX
# Filter for businesses in operation four years or less (first_year 2015-2018).
biz4 = infoUsaCsa[ infoUsaCsa['first_year'].isin(['2015', '2016', '2017', '2018']) ]
print('Count: first_year == 2018, 2017, 2016, 2015: ', biz4.size / len(biz4.columns))
biz4 = biz4[ ['CSA2010'] ]
# numerator.to_csv('biz18_numerator_csasWithCounts.csv')
biz4['biz4Count'] = 1

#export
# Aggregate numeric values by sum.
biz4 = biz4.groupby('CSA2010').sum(numeric_only=True).reset_index()
biz4 = biz4.merge( csa[ ['CSA2010', 'tpop10'] ], left_on='CSA2010', right_on='CSA2010' )
biz4 = pd.concat([biz4, pd.DataFrame([{'CSA2010': 'Baltimore City', 'tpop10': 620961,
                                       'biz4Count': biz4['biz4Count'].sum()}])], ignore_index=True)
biz4.tail(1)

# Create the indicator (rows align with infoUsaCsaTotals, which ends with the Baltimore City total).
biz4['biz4'] = biz4['biz4Count'] / infoUsaCsaTotals['numbus']

# Save
biz4.to_csv('biz4_18.csv', index=False)
biz4.head()

The prim_naics column type differs across the annual tables:
2016 -> prim_naics character varying(254)
2017 -> prim_naics bigint
Convert Column IntToString
CREATE OR REPLACE FUNCTION pc_inttochar(chartoconvert bigint)
RETURNS character AS $BODY$
    SELECT CASE WHEN 1 = 1
                THEN CAST($1 AS character(254))
                ELSE NULL END;
$BODY$ LANGUAGE 'sql' IMMUTABLE STRICT;
ALTER TABLE economy.infousa_2017 ALTER COLUMN prim_naics TYPE character varying(254) USING pc_inttochar(prim_naics);
# https://bniajfi.org/indicators/Workforce%20and%20Economic%20Development/neiind/2017
neiind_SQL = """ 157 - neiindXX
with tbl AS (
    select (sum(case when prim_naics LIKE '44%' OR prim_naics LIKE '45%' OR prim_naics LIKE '52%'
                          OR prim_naics LIKE '54%' OR prim_naics LIKE '62%' OR prim_naics LIKE '71%'
                          OR prim_naics LIKE '72%' OR prim_naics LIKE '81%'
                     then 1 else 0 end)::numeric(20,2)) as result, csa
    from vital_signs.match_csas_and_bc_by_geom('economy.infousa_2016', 'gid', 'the_geom') a
    left join economy.infousa_2016 b on a.gid = b.gid
    group by csa, the_pop
)
update vital_signs.data set neiind = result from tbl
where data.csa = tbl.csa and data_year = '2016';
"""
Translation = """ """

infoUsaCsa.head()

#export
# 157 - neiindXX
# Filter for neighborhood-oriented industries (NAICS codes starting with 44, 45, 52, 54, 62, 71, 72, 81).
neiind = infoUsaCsa.copy()
neiind['naics_extra_short'] = neiind.prim_naics.astype(str).str[:-6].astype(np.int64)
neiind = neiind[ neiind['naics_extra_short'].isin([44, 45, 52, 54, 62, 71, 72, 81]) ]
print('Count of NAICS starting with 44, 45, 52, 54, 62, 71, 72, 81: ', neiind.size / len(neiind.columns))
neiind = neiind[ ['CSA2010'] ]
# numerator.to_csv('biz18_numerator_csasWithCounts.csv')
neiind['neiind'] = 1

#export
# Aggregate numeric values by sum.
neiind = neiind.groupby('CSA2010').sum(numeric_only=True).reset_index()
neiind = neiind.merge( csa[ ['CSA2010', 'tpop10'] ], left_on='CSA2010', right_on='CSA2010' )
neiind = pd.concat([neiind, pd.DataFrame([{'CSA2010': 'Baltimore City', 'tpop10': 620961,
                                           'neiind': neiind['neiind'].sum()}])], ignore_index=True)
neiind.tail(1)

# Save
neiind.to_csv('neiind18.csv', index=False)
neiind.head()

The prim_naics column type differs across the annual tables:
2016 -> prim_naics character varying(254)
2017 -> prim_naics bigint
Convert Column IntToString
CREATE OR REPLACE FUNCTION pc_inttochar(chartoconvert bigint)
RETURNS character AS $BODY$
    SELECT CASE WHEN 1 = 1
                THEN CAST($1 AS character(254))
                ELSE NULL END;
$BODY$ LANGUAGE 'sql' IMMUTABLE STRICT;
ALTER TABLE economy.infousa_2017 ALTER COLUMN prim_naics TYPE character varying(254) USING pc_inttochar(prim_naics);
# https://bniajfi.org/indicators/Workforce%20and%20Economic%20Development/neiind/2017
neibus_SQL = """ 158 - neibusXX
with tbl AS (
    select (sum(case when prim_naics LIKE '44%' OR prim_naics LIKE '45%' OR prim_naics LIKE '52%'
                          OR prim_naics LIKE '54%' OR prim_naics LIKE '62%' OR prim_naics LIKE '71%'
                          OR prim_naics LIKE '72%' OR prim_naics LIKE '81%'
                     then 1 else 0 end)::numeric * 1000) / the_pop as result, csa
    from vital_signs.match_csas_and_bc_by_geom('economy.infousa_2016', 'gid', 'the_geom') a
    left join economy.infousa_2016 b on a.gid = b.gid
    group by csa, the_pop
)
update vital_signs.data set neibus = result from tbl
where data.csa = tbl.csa and data_year = '2016';
"""
Translation = """ """

infoUsaCsa.head()

#export
# 158 - neibus
# Filter for neighborhood-oriented industries (NAICS codes starting with 44, 45, 52, 54, 62, 71, 72, 81).
neibus = infoUsaCsa.copy()
neibus['naics_extra_short'] = neibus.prim_naics.astype(str).str[:-6].astype(np.int64)
neibus = neibus[ neibus['naics_extra_short'].isin([44, 45, 52, 54, 62, 71, 72, 81]) ]
print('Count of NAICS starting with 44, 45, 52, 54, 62, 71, 72, 81: ', neibus.size / len(neibus.columns))
neibus = neibus[ ['CSA2010'] ]
# numerator.to_csv('biz18_numerator_csasWithCounts.csv')
neibus['neibus'] = 1
neibus.head()

#export
# Aggregate numeric values by sum.
neibus = neibus.groupby('CSA2010').sum(numeric_only=True).reset_index()
neibus = neibus.merge( csa[ ['CSA2010', 'tpop10'] ], left_on='CSA2010', right_on='CSA2010' )
neibus = pd.concat([neibus, pd.DataFrame([{'CSA2010': 'Baltimore City', 'tpop10': 620961,
                                           'neibus': neibus['neibus'].sum()}])], ignore_index=True)
neibus['neibus'] = neibus['neibus'] * 1000 / neibus['tpop10']
neibus.tail(1)

# Save
neibus.to_csv('neibus18.csv', index=False)
neibus.head()

The prim_naics column type differs across the annual tables:
2016 -> prim_naics character varying(254)
2017 -> prim_naics bigint
Convert Column IntToString
CREATE OR REPLACE FUNCTION pc_inttochar(chartoconvert bigint)
RETURNS character AS $BODY$
    SELECT CASE WHEN 1 = 1
                THEN CAST($1 AS character(254))
                ELSE NULL END;
$BODY$ LANGUAGE 'sql' IMMUTABLE STRICT;
ALTER TABLE economy.infousa_2017 ALTER COLUMN prim_naics TYPE character varying(254) USING pc_inttochar(prim_naics);
# https://bniajfi.org/indicators/Workforce%20and%20Economic%20Development/neiemp/2017
neiemp_SQL = """ 159 - neiempXX
with tbl AS (
    select (sum(case when prim_naics LIKE '44%' OR prim_naics LIKE '45%' OR prim_naics LIKE '52%'
                          OR prim_naics LIKE '54%' OR prim_naics LIKE '62%' OR prim_naics LIKE '71%'
                          OR prim_naics LIKE '72%' OR prim_naics LIKE '81%'
                     then empl_size else 0 end)) as result, csa
    from vital_signs.match_csas_and_bc_by_geom('economy.infousa_2016', 'gid', 'the_geom') a
    left join economy.infousa_2016 b on a.gid = b.gid
    group by csa, the_pop
)
update vital_signs.data set neiemp = result from tbl
where data.csa = tbl.csa and data_year = '2016';
"""
Translation = """ """

infoUsaCsa.head()

#export
# 159 - neiempXX
# Filter for neighborhood-oriented industries (NAICS codes starting with 44, 45, 52, 54, 62, 71, 72, 81).
neiemp = infoUsaCsa.copy()
neiemp['naics_extra_short'] = neiemp.prim_naics.astype(str).str[:-6].astype(np.int64)
neiemp = neiemp[ neiemp['naics_extra_short'].isin([44, 45, 52, 54, 62, 71, 72, 81]) ]
print('Count of NAICS starting with 44, 45, 52, 54, 62, 71, 72, 81: ', neiemp.size / len(neiemp.columns))
# numerator.to_csv('biz18_numerator_csasWithCounts.csv')

#export
# Aggregate employee counts by sum.
neiemp = neiemp.groupby('CSA2010')[ ['empl_size'] ].sum(numeric_only=True).reset_index()
neiemp = neiemp.merge( csa[ ['CSA2010', 'tpop10'] ], left_on='CSA2010', right_on='CSA2010' )
neiemp = pd.concat([neiemp, pd.DataFrame([{'CSA2010': 'Baltimore City', 'tpop10': 620961,
                                           'empl_size': neiemp['empl_size'].sum()}])], ignore_index=True)
neiemp['neiemp'] = neiemp['empl_size']
neiemp = neiemp.drop('empl_size', axis=1)
neiemp.tail()

# Save
neiemp.to_csv('neiemp18.csv', index=False)
neiemp.head()