Skip to content

Commit

Permalink
Merge branch 'master' of github.com:DataKind-SF/datadive_201503_sf-health-improvement-partnership
Browse files Browse the repository at this point in the history
  • Loading branch information
bllchmbrs committed Mar 29, 2015
2 parents 16a388e + de33b54 commit e414e5a
Show file tree
Hide file tree
Showing 12 changed files with 897 additions and 1,112 deletions.
73 changes: 73 additions & 0 deletions cluster_by_demo_and_license.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
library(dplyr)
library(reshape2)
library(ggplot2)
library(rgdal)
library(maptools)
options(stringsAsFactors = F)

# Load the merged crime / census / alcohol-license data prepared upstream.
load('output/crime_census_alcohol.rda')

# Quick look at every variable's distribution.
summary(crime_census_alcohol)

# Min-max scale median income to [0, 1] and express unemployment as a
# proportion rather than a percentage, so the features share a scale.
income_range <- range(crime_census_alcohol$med_income)
crime_census_alcohol$med_income <-
  (crime_census_alcohol$med_income - income_range[1]) /
  (income_range[2] - income_range[1])
crime_census_alcohol$Unemploy_p <- crime_census_alcohol$Unemploy_p / 100

# Inspect the low-population tail before filtering anything out.
hist(crime_census_alcohol$Pop2010)
summary(crime_census_alcohol$Pop2010[crime_census_alcohol$Pop2010 < 2000])

# Drop sparsely populated tracts (fewer than 500 residents).
crime_census_alcohol <- subset(crime_census_alcohol, Pop2010 >= 500)

# Prefix the six demographic columns (positions 3:8) so they can be picked
# up by name pattern, then keep only demographic and license features as
# the clustering input.
demo_idx <- 3:8
names(crime_census_alcohol)[demo_idx] <-
  paste0('demo_', tolower(names(crime_census_alcohol)[demo_idx]))
census_alcohol <- crime_census_alcohol %>%
  select(starts_with('demo'), starts_with('license'))

# Fit k-means with k centres on dat and return only the pieces used
# downstream: total within-cluster SS, between-cluster SS, and the
# per-row cluster assignments.
do_kmeans <- function(dat, k) {
  fit <- kmeans(dat, k)
  list(
    within  = fit$tot.withinss,
    between = fit$betweenss,
    cluster = fit$cluster
  )
}

# Fit k-means for every k in 1..40 so we can look for an elbow.
kmeans_results <- lapply(1:40, function(k) do_kmeans(census_alcohol, k))

# Collect within/between sums of squares per k into long format and plot
# both curves against k.
kmeans_dists <- data.frame(
  k = 1:40,
  within = vapply(kmeans_results, function(r) r$within, numeric(1)),
  between = vapply(kmeans_results, function(r) r$between, numeric(1))
)
kmeans_dists_long <- melt(kmeans_dists, id = 'k')
# NOTE(review): this plot object is never printed or saved; render it
# interactively (or add ggsave) to actually inspect the elbow.
kmeans_dists_plot <-
  ggplot(kmeans_dists_long, aes(x = k, y = value, colour = variable)) +
  geom_line()

# The elbow suggests ~20 clusters; attach those labels to the tracts.
crime_census_alcohol$cluster <- kmeans_results[[20]]$cluster

# Crime categories considered alcohol-related; joined into a single regex
# that is matched against column names below.
relevant_crimes <- paste(
  c('arson', 'assault', 'burglary', 'disorderly_conduct',
    'driving_under_the_influence', 'drunkenness', 'liquor_laws',
    'prostitution', 'robbery', 'sex_offenses', 'vandalism'),
  collapse = '|'
)

# Keep identifiers, features, the cluster label, and every crime column
# whose name matches the alcohol-related pattern.  The grep runs against
# the pre-pipe column set, matching the original right-hand-side semantics.
relevant_cols <- grep(relevant_crimes, names(crime_census_alcohol), value = TRUE)
crime_census_alcohol <- crime_census_alcohol %>%
  select(Tract2010, Pop2010, starts_with('demo'), starts_with('license'), cluster) %>%
  cbind(crime_census_alcohol[relevant_cols])

# Aggregate crime per tract: row sum over every column whose name
# contains 'crime'.
crime_col_names <- grep('crime', names(crime_census_alcohol), value = TRUE)
crime_census_alcohol$agg_crime <- rowSums(crime_census_alcohol[crime_col_names])

# Variance of the aggregate crime rate within each k-means cluster.
crime_per_cluster <- crime_census_alcohol %>%
  group_by(cluster) %>%
  summarise(agg_crime_var = var(agg_crime))

write.csv(crime_census_alcohol, file = 'output/crime_census_alcohol.csv', row.names = FALSE)
write.csv(crime_per_cluster, file = 'output/crime_per_cluster.csv', row.names = FALSE)
107 changes: 107 additions & 0 deletions datasf_crime_pull_preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""
Code to pull DataSF.org Crime data and preprocess it on a scheduled basis with Linux crond. UNTESTED. More annotation pending.
"""

import pandas as pd
import os
import urllib2
import numpy as np
import json
import pyproj

def main():
    """Pull the DataSF crime extract, clean it, and return a DataFrame.

    Steps: download the Socrata JSON dump, build a DataFrame from it,
    merge the date/time columns into a single datetime, drop unwanted
    resolutions / pre-2010 rows / categories, attach a coarse category
    label, and run x/y through a Lambert conformal conic projection.

    Returns the cleaned pandas DataFrame.  UNTESTED (see module docstring).
    """
    # Pull the crime extract from datasf.org.  `-O out.json` writes the
    # response body to out.json; the original `wget url > out.json` left
    # out.json empty because wget saves to a URL-derived filename, not
    # stdout.  The URL is quoted so the shell does not interpret '?'.
    jsonurl = 'https://data.sfgov.org/api/views/gxxq-x39z/rows.json?accessType=DOWNLOAD'
    os.system('wget -O out.json "' + jsonurl + '"')

    # Parse the downloaded JSON.
    with open("out.json") as f:
        data = json.load(f)

    # Column names come from the Socrata metadata block; strip the ':'
    # used on internal fields (e.g. ':id') so they are plain identifiers.
    columns = [col['fieldName'].replace(":", "")
               for col in data["meta"]["view"]["columns"]]
    df = pd.DataFrame(data["data"], columns=columns)

    # Merge date and time into one 'datetime' column.  Coerce any parsed
    # datetime columns back to strings first so slicing works uniformly.
    for col in ['date', 'time']:
        if df[col].dtype == 'datetime64[ns]':
            df[col] = [str(x) for x in df[col]]
    df['datetime'] = [d[0:10] + ' ' + t for d, t in zip(df.date, df.time)]
    del df['date']
    del df['time']
    df.datetime = pd.to_datetime(df.datetime, format='%Y-%m-%d %H:%M')

    # DataFrame.sort() was removed from pandas; sort_values is the
    # replacement with identical semantics here.
    df = df.sort_values(['incidntnum', 'datetime']).reset_index(drop=True)

    # Cut resolutions that do not represent substantiated incidents.
    these = ['UNFOUNDED', 'CLEARED-CONTACT JUVENILE FOR MORE INFO',
             'EXCEPTIONAL CLEARANCE']
    df = df[~df.resolution.isin(these)].reset_index(drop=True)

    # Cut all incidents before 2010.
    df = df[df.datetime > pd.to_datetime('2010', format='%Y')].reset_index(drop=True)

    # Cut to the categories of interest.
    keepthese = ['SUICIDE', 'SEX OFFENSES, FORCIBLE', 'ASSAULT', 'ROBBERY',
                 'WEAPON LAWS', 'DRUG/NARCOTIC', 'DRUNKENNESS',
                 'DRIVING UNDER THE INFLUENCE', 'DISORDERLY CONDUCT',
                 'LIQUOR LAWS', 'VANDALISM', 'FAMILY OFFENSES',
                 'PROSTITUTION', 'SEX OFFENSES, NON FORCIBLE', 'TRESPASS',
                 'LOITERING', 'SUSPICIOUS OCC']
    df = df[df.category.isin(keepthese)].reset_index(drop=True)

    # Throw out garbage columns.
    keepthese = ['incidntnum', 'category', 'descript', 'dayofweek',
                 'pddistrict', 'resolution', 'address', 'x', 'y', 'datetime']
    df = df[keepthese]

    # Coarse category buckets.
    violence = ['SEX OFFENSES, FORCIBLE', 'SEX OFFENSES, NON FORCIBLE',
                'ASSAULT', 'ROBBERY', 'WEAPON LAWS', 'SUICIDE',
                'FAMILY OFFENSES']
    vandalism = ['SUSPICIOUS OCC', 'VANDALISM', 'TRESPASS']
    drugs = ['DRUG/NARCOTIC']
    alcohol = ['LIQUOR LAWS', 'DRUNKENNESS', 'DISORDERLY CONDUCT', 'LOITERING']
    prostitution = ['PROSTITUTION']
    dui = ['DRIVING UNDER THE INFLUENCE']

    # Build a category -> coarse-label mapping and apply it vectorized.
    # This replaces the original row loop of chained assignments
    # (df['col'][i] = ...), which pandas may silently apply to a copy
    # (SettingWithCopy).  Every category that survived the filter above
    # appears in one of the buckets, so .map() introduces no NaN.
    # NOTE(review): 'CoarseCategroy' is a typo for 'CoarseCategory', kept
    # because the returned column name is part of the output schema.
    coarse_map = {}
    for cats, label in [(violence, 'violence'), (vandalism, 'vandalism'),
                        (drugs, 'drugs'), (alcohol, 'alcohol'),
                        (prostitution, 'prostitution'), (dui, 'dui')]:
        for cat in cats:
            coarse_map[cat] = label
    df['CoarseCategroy'] = df['category'].map(coarse_map)

    # add Coarse descriptor Kris code....

    # Project x/y with a Lambert conformal conic (California-style
    # parameters on GRS80).
    # NOTE(review): this is the FORWARD transform (lon/lat -> metres); if
    # x/y are already projected coordinates, the intended call may be
    # isn2004(x, y, inverse=True) to recover lon/lat -- confirm against
    # the data.
    isn2004 = pyproj.Proj("+proj=lcc +lat_1=38.43333333333333 +lat_2=37.06666666666667 +lat_0=36.5 +lon_0=-120.5 +x_0=2000000 +y_0=500000 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs")
    df.x = df.x.astype(float)
    df.y = df.y.astype(float)

    pairs = [isn2004(px, py) for px, py in zip(df.x, df.y)]
    # Tuple-unpack instead of indexing zip(): zip() returns a lazy,
    # non-subscriptable object on Python 3; unpacking works on both 2 and 3.
    newx, newy = zip(*pairs)
    df["newx"] = list(newx)
    df["newy"] = list(newy)

    return df


# Run the pull/preprocess pipeline when executed as a script.  The
# returned DataFrame is discarded here; import main() to use the result.
if __name__ == '__main__':
    main()


Loading

0 comments on commit e414e5a

Please sign in to comment.