-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'master' of github.com:DataKind-SF/datadive_201503_sf-he…
…alth-improvement-partnership
- Loading branch information
Showing
12 changed files
with
897 additions
and
1,112 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
library(dplyr) | ||
library(reshape2) | ||
library(ggplot2) | ||
library(rgdal) | ||
library(maptools) | ||
options(stringsAsFactors = F) | ||
|
||
# Load the saved workspace image; it is expected to provide the
# `crime_census_alcohol` data frame used throughout this script.
load('output/crime_census_alcohol.rda')

# Overview of every column
summary(crime_census_alcohol)

# Transform variables: min-max rescale med_income onto [0, 1] and divide
# Unemploy_p by 100 (presumably percent -> fraction; verify against the data).
income = crime_census_alcohol$med_income
income_min = min(income)
crime_census_alcohol$med_income = (income - income_min) / (max(income) - income_min)
crime_census_alcohol$Unemploy_p = crime_census_alcohol$Unemploy_p / 100

# Look at the population distribution, then at the tracts below 2000 people
hist(crime_census_alcohol$Pop2010)
tract_pop = crime_census_alcohol$Pop2010
summary(tract_pop[tract_pop < 2000])

# Throw out tracts with populations less than 500
# (rows with a missing population are dropped as well, as subset() would do)
keep_tract = with(crime_census_alcohol, !is.na(Pop2010) & Pop2010 >= 500)
crime_census_alcohol = crime_census_alcohol[keep_tract, ]

# k-means clustering input: columns 3 to 8 are lower-cased and prefixed with
# 'demo_' so they can be picked up by name below
demo_idx = 3:8
names(crime_census_alcohol)[demo_idx] =
  paste0('demo_', tolower(names(crime_census_alcohol)[demo_idx]))
census_alcohol = select(
  crime_census_alcohol,
  starts_with('demo'),
  starts_with('license')
)
|
||
# Run k-means on `dat` with `k` clusters and keep only the pieces of the fit
# that the rest of the script uses.
#
# Arguments:
#   dat  data handed straight to kmeans()
#   k    number of clusters (passed to kmeans() as `centers`)
#   ...  optional extra arguments forwarded to kmeans(), e.g. `nstart` or
#        `iter.max`; leaving them out gives the same call as do_kmeans(dat, k)
#        always made
#
# Returns a list with
#   within   total within-cluster sum of squares (tot.withinss)
#   between  between-cluster sum of squares (betweenss)
#   cluster  cluster index assigned to each row of `dat`
do_kmeans = function(dat, k, ...) {
  model = kmeans(dat, centers = k, ...)

  list(
    within = model$tot.withinss,
    between = model$betweenss,
    cluster = model$cluster
  )
}
|
||
# Fit k-means once for every candidate number of clusters.
# kmeans() starts from randomly chosen centres, so the seed is fixed here:
# without it the curve below, the clustering picked from it and both CSV files
# written at the end change from one run to the next.
set.seed(201503)
k_values = 1:40
kmeans_results = lapply(k_values, function(k) do_kmeans(census_alcohol, k))

# plot results: within- and between-cluster sum of squares against k
kmeans_dists = data.frame(
  k = k_values,
  within = sapply(kmeans_results, function(res) res$within),
  between = sapply(kmeans_results, function(res) res$between)
)
kmeans_dists_long = melt(kmeans_dists, id = 'k')
kmeans_dists_plot =
  ggplot(kmeans_dists_long, aes(x = k, y = value, colour = variable)) +
  geom_line()
# A ggplot object is only drawn when it is printed; assigning it is not enough.
print(kmeans_dists_plot)

# 20 appears to be a good number of clusters
# NOTE(review): that judgement was made on an unseeded run; re-check it against
# the plot produced with the fixed seed.
chosen_k = 20
crime_census_alcohol$cluster = kmeans_results[[match(chosen_k, k_values)]]$cluster

# pick alcohol-related crimes (regular expression matched against column names)
relevant_crimes = 'arson|assault|burglary|disorderly_conduct|driving_under_the_influence|drunkenness|liquor_laws|prostitution|robbery|sex_offenses|vandalism'
crime_cols = grep(relevant_crimes, names(crime_census_alcohol), value = TRUE)

# keep the identifying, demographic, license and cluster columns plus the
# selected crime columns
crime_census_alcohol = crime_census_alcohol %>%
  select(Tract2010, Pop2010, starts_with('demo'), starts_with('license'), cluster) %>%
  cbind(crime_census_alcohol[crime_cols])

# calculate aggregate crime rate: row-wise sum over every column whose name
# contains 'crime'
# NOTE(review): this relies on the crime columns carrying 'crime' in their
# names - confirm against the input table.
agg_cols = grep('crime', names(crime_census_alcohol), value = TRUE)
crime_census_alcohol$agg_crime = rowSums(crime_census_alcohol[agg_cols])

# calculate the variance of crimes within each cluster
crime_per_cluster = crime_census_alcohol %>%
  group_by(cluster) %>%
  summarise(agg_crime_var = var(agg_crime))

write.csv(crime_census_alcohol, file = 'output/crime_census_alcohol.csv', row.names = FALSE)
write.csv(crime_per_cluster, file = 'output/crime_per_cluster.csv', row.names = FALSE)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
""" | ||
Code to pull DataSF.org Crime data and preprocess it on a scheduled basis with Linux crond. UNTESTED. More annotation pending. | ||
""" | ||
|
||
import pandas as pd | ||
import os | ||
import urllib2 | ||
import numpy as np | ||
import json | ||
import pyproj | ||
|
||
def main():
    """Download the DataSF crime export and return it as a cleaned DataFrame.

    Steps, in order:

    1. Download the JSON export and save the raw response to ``out.json``
       in the current working directory.
    2. Build a DataFrame whose column names are the export's ``fieldName``
       values with any ``:`` removed.
    3. Merge the ``date`` and ``time`` columns into one ``datetime`` column
       and sort by ``incidntnum`` and ``datetime``.
    4. Drop rows with an excluded resolution, rows dated before 2010 and
       rows whose category is not in the keep-list.
    5. Keep a fixed set of columns and add ``CoarseCategroy`` (the
       misspelling is kept so existing consumers of the column still work).
    6. Pass ``x``/``y`` through the pyproj projection defined below and
       store the results in ``newx``/``newy``.

    Returns:
        pandas.DataFrame: the filtered incidents with a fresh 0..n-1 index.
    """
    # Function-scope imports keep this block usable on both Python 2 and 3.
    import contextlib
    try:
        from urllib.request import urlopen  # Python 3
    except ImportError:
        from urllib2 import urlopen  # Python 2

    # pull json of Crime... from datasf.org
    # The previous `wget <url> > out.json` redirected wget's stdout, but wget
    # saves the download under a file name derived from the URL, so out.json
    # ended up empty. Download in-process and write out.json explicitly.
    jsonurl = 'https://data.sfgov.org/api/views/gxxq-x39z/rows.json?accessType=DOWNLOAD'
    with contextlib.closing(urlopen(jsonurl)) as response:
        raw = response.read()
    with open("out.json", "wb") as f:
        f.write(raw)

    data = json.loads(raw.decode("utf-8"))
    columns = [col['fieldName'].replace(":", "")
               for col in data["meta"]["view"]["columns"]]
    df = pd.DataFrame(data["data"], columns=columns)

    # Merge date and time
    for col in ['date', 'time']:
        if df[col].dtype == 'datetime64[ns]':
            df[col] = [str(x) for x in df[col]]

    # Only the first 10 characters of `date` (YYYY-MM-DD) are used; the time of
    # day comes from the separate `time` column.
    df['datetime'] = pd.to_datetime(df['date'].str[:10] + ' ' + df['time'],
                                    format='%Y-%m-%d %H:%M')
    del df['date']
    del df['time']

    # DataFrame.sort(columns=...) no longer exists in pandas.
    df = df.sort_values(by=['incidntnum', 'datetime'])

    # Cut resolution
    excluded_resolutions = ['UNFOUNDED', 'CLEARED-CONTACT JUVENILE FOR MORE INFO', 'EXCEPTIONAL CLEARANCE']
    df = df[~df.resolution.isin(excluded_resolutions)]

    # Cut all before 2010
    # `>=` keeps incidents stamped exactly 2010-01-01 00:00, which the former
    # strict `>` dropped even though they are not "before 2010".
    df = df[df.datetime >= pd.to_datetime('2010', format='%Y')]

    # add Coarse Category: fine-grained category -> coarse label. Its keys are
    # also the list of categories to keep.
    coarse_by_category = {
        'SEX OFFENSES, FORCIBLE': 'violence',
        'SEX OFFENSES, NON FORCIBLE': 'violence',
        'ASSAULT': 'violence',
        'ROBBERY': 'violence',
        'WEAPON LAWS': 'violence',
        'SUICIDE': 'violence',
        'FAMILY OFFENSES': 'violence',
        'SUSPICIOUS OCC': 'vandalism',
        'VANDALISM': 'vandalism',
        'TRESPASS': 'vandalism',
        'DRUG/NARCOTIC': 'drugs',
        'LIQUOR LAWS': 'alcohol',
        'DRUNKENNESS': 'alcohol',
        'DISORDERLY CONDUCT': 'alcohol',
        'LOITERING': 'alcohol',
        'PROSTITUTION': 'prostitution',
        'DRIVING UNDER THE INFLUENCE': 'dui',
    }

    # Cut Category (s)
    df = df[df.category.isin(list(coarse_by_category))]

    # Throw out garbage columns
    keep_columns = ['incidntnum', 'category', 'descript', 'dayofweek', 'pddistrict',
                    'resolution', 'address', 'x', 'y', 'datetime']
    # .copy() makes the frame independent of the slices above so the column
    # assignments below always land in `df`.
    df = df[keep_columns].reset_index(drop=True).copy()

    # One vectorized lookup replaces the per-row chained assignment
    # (df[col][i] = ...), which pandas does not guarantee writes to `df`.
    df['CoarseCategroy'] = df.category.map(coarse_by_category)

    # TODO: add coarse descriptor (placeholder left by the original authors)

    # Project the x/y coordinates with pyproj (Lambert conformal conic, metres,
    # per the proj string).
    isn2004 = pyproj.Proj("+proj=lcc +lat_1=38.43333333333333 +lat_2=37.06666666666667 +lat_0=36.5 +lon_0=-120.5 +x_0=2000000 +y_0=500000 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs")
    df['x'] = df['x'].astype(float)
    df['y'] = df['y'].astype(float)

    # A single array call replaces the per-row loop plus zip(*...)[0], which
    # fails on Python 3 (zip is an iterator) and on an empty frame.
    newx, newy = isn2004(df['x'].values, df['y'].values)
    df["newx"] = newx
    df["newy"] = newy

    return df
|
||
|
||
# Script entry point: run the pipeline only when this file is executed
# directly, not when it is imported. The DataFrame returned by main() is not
# used here.
if __name__ == "__main__":
    main()
|
||
|
Oops, something went wrong.