Skip to content

Commit

Permalink
Merge branch 'master' of github.com:DataKind-SF/datadive_201503_sf-health-improvement-partnership
Browse files Browse the repository at this point in the history
  • Loading branch information
bllchmbrs committed Mar 29, 2015
2 parents 16a388e + de33b54 commit e414e5a
Show file tree
Hide file tree
Showing 12 changed files with 897 additions and 1,112 deletions.
73 changes: 73 additions & 0 deletions cluster_by_demo_and_license.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
library(dplyr)
library(reshape2)
library(ggplot2)
library(rgdal)
library(maptools)
options(stringsAsFactors = F)

# Load the merged crime / census / alcohol-license data prepared upstream.
load('output/crime_census_alcohol.rda')

# Quick look at every variable's distribution.
summary(crime_census_alcohol)

# Min-max scale median income to [0, 1] and express unemployment as a
# proportion rather than a percentage, so the features share a scale.
income_range <- range(crime_census_alcohol$med_income)
crime_census_alcohol$med_income <-
  (crime_census_alcohol$med_income - income_range[1]) /
  (income_range[2] - income_range[1])
crime_census_alcohol$Unemploy_p <- crime_census_alcohol$Unemploy_p / 100

# Inspect the low-population tail before filtering anything out.
hist(crime_census_alcohol$Pop2010)
summary(crime_census_alcohol$Pop2010[crime_census_alcohol$Pop2010 < 2000])

# Drop sparsely populated tracts (fewer than 500 residents).
crime_census_alcohol <- subset(crime_census_alcohol, Pop2010 >= 500)

# Prefix the six demographic columns (positions 3:8) so they can be picked
# up by name pattern, then keep only demographic and license features as
# the clustering input.
demo_idx <- 3:8
names(crime_census_alcohol)[demo_idx] <-
  paste0('demo_', tolower(names(crime_census_alcohol)[demo_idx]))
census_alcohol <- crime_census_alcohol %>%
  select(starts_with('demo'), starts_with('license'))

# Fit k-means with k centres on dat and return only the pieces used
# downstream: total within-cluster SS, between-cluster SS, and the
# per-row cluster assignments.
do_kmeans <- function(dat, k) {
  fit <- kmeans(dat, k)
  list(
    within  = fit$tot.withinss,
    between = fit$betweenss,
    cluster = fit$cluster
  )
}

# Fit k-means for every k in 1..40 so we can look for an elbow.
kmeans_results <- lapply(1:40, function(k) do_kmeans(census_alcohol, k))

# Collect within/between sums of squares per k into long format and plot
# both curves against k.
kmeans_dists <- data.frame(
  k = 1:40,
  within = vapply(kmeans_results, function(r) r$within, numeric(1)),
  between = vapply(kmeans_results, function(r) r$between, numeric(1))
)
kmeans_dists_long <- melt(kmeans_dists, id = 'k')
# NOTE(review): this plot object is never printed or saved; render it
# interactively (or add ggsave) to actually inspect the elbow.
kmeans_dists_plot <-
  ggplot(kmeans_dists_long, aes(x = k, y = value, colour = variable)) +
  geom_line()

# The elbow suggests ~20 clusters; attach those labels to the tracts.
crime_census_alcohol$cluster <- kmeans_results[[20]]$cluster

# Crime categories considered alcohol-related; joined into a single regex
# that is matched against column names below.
relevant_crimes <- paste(
  c('arson', 'assault', 'burglary', 'disorderly_conduct',
    'driving_under_the_influence', 'drunkenness', 'liquor_laws',
    'prostitution', 'robbery', 'sex_offenses', 'vandalism'),
  collapse = '|'
)

# Keep identifiers, features, the cluster label, and every crime column
# whose name matches the alcohol-related pattern.  The grep runs against
# the pre-pipe column set, matching the original right-hand-side semantics.
relevant_cols <- grep(relevant_crimes, names(crime_census_alcohol), value = TRUE)
crime_census_alcohol <- crime_census_alcohol %>%
  select(Tract2010, Pop2010, starts_with('demo'), starts_with('license'), cluster) %>%
  cbind(crime_census_alcohol[relevant_cols])

# Aggregate crime per tract: row sum over every column whose name
# contains 'crime'.
crime_col_names <- grep('crime', names(crime_census_alcohol), value = TRUE)
crime_census_alcohol$agg_crime <- rowSums(crime_census_alcohol[crime_col_names])

# Variance of the aggregate crime rate within each k-means cluster.
crime_per_cluster <- crime_census_alcohol %>%
  group_by(cluster) %>%
  summarise(agg_crime_var = var(agg_crime))

write.csv(crime_census_alcohol, file = 'output/crime_census_alcohol.csv', row.names = FALSE)
write.csv(crime_per_cluster, file = 'output/crime_per_cluster.csv', row.names = FALSE)
107 changes: 107 additions & 0 deletions datasf_crime_pull_preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
"""
Code to pull DataSF.org Crime data and preprocess it on a scheduled basis with Linux crond. UNTESTED. More annotation pending.
"""

import pandas as pd
import os
import urllib2
import numpy as np
import json
import pyproj

def main():
    """Pull the DataSF crime extract, clean it, and return a DataFrame.

    Steps: download the Socrata JSON dump, build a DataFrame from it,
    merge the date/time columns into a single datetime, drop unwanted
    resolutions / pre-2010 rows / categories, attach a coarse category
    label, and run x/y through a Lambert conformal conic projection.

    Returns the cleaned pandas DataFrame.  UNTESTED (see module docstring).
    """
    # Pull the crime extract from datasf.org.  `-O out.json` writes the
    # response body to out.json; the original `wget url > out.json` left
    # out.json empty because wget saves to a URL-derived filename, not
    # stdout.  The URL is quoted so the shell does not interpret '?'.
    jsonurl = 'https://data.sfgov.org/api/views/gxxq-x39z/rows.json?accessType=DOWNLOAD'
    os.system('wget -O out.json "' + jsonurl + '"')

    # Parse the downloaded JSON.
    with open("out.json") as f:
        data = json.load(f)

    # Column names come from the Socrata metadata block; strip the ':'
    # used on internal fields (e.g. ':id') so they are plain identifiers.
    columns = [col['fieldName'].replace(":", "")
               for col in data["meta"]["view"]["columns"]]
    df = pd.DataFrame(data["data"], columns=columns)

    # Merge date and time into one 'datetime' column.  Coerce any parsed
    # datetime columns back to strings first so slicing works uniformly.
    for col in ['date', 'time']:
        if df[col].dtype == 'datetime64[ns]':
            df[col] = [str(x) for x in df[col]]
    df['datetime'] = [d[0:10] + ' ' + t for d, t in zip(df.date, df.time)]
    del df['date']
    del df['time']
    df.datetime = pd.to_datetime(df.datetime, format='%Y-%m-%d %H:%M')

    # DataFrame.sort() was removed from pandas; sort_values is the
    # replacement with identical semantics here.
    df = df.sort_values(['incidntnum', 'datetime']).reset_index(drop=True)

    # Cut resolutions that do not represent substantiated incidents.
    these = ['UNFOUNDED', 'CLEARED-CONTACT JUVENILE FOR MORE INFO',
             'EXCEPTIONAL CLEARANCE']
    df = df[~df.resolution.isin(these)].reset_index(drop=True)

    # Cut all incidents before 2010.
    df = df[df.datetime > pd.to_datetime('2010', format='%Y')].reset_index(drop=True)

    # Cut to the categories of interest.
    keepthese = ['SUICIDE', 'SEX OFFENSES, FORCIBLE', 'ASSAULT', 'ROBBERY',
                 'WEAPON LAWS', 'DRUG/NARCOTIC', 'DRUNKENNESS',
                 'DRIVING UNDER THE INFLUENCE', 'DISORDERLY CONDUCT',
                 'LIQUOR LAWS', 'VANDALISM', 'FAMILY OFFENSES',
                 'PROSTITUTION', 'SEX OFFENSES, NON FORCIBLE', 'TRESPASS',
                 'LOITERING', 'SUSPICIOUS OCC']
    df = df[df.category.isin(keepthese)].reset_index(drop=True)

    # Throw out garbage columns.
    keepthese = ['incidntnum', 'category', 'descript', 'dayofweek',
                 'pddistrict', 'resolution', 'address', 'x', 'y', 'datetime']
    df = df[keepthese]

    # Coarse category buckets.
    violence = ['SEX OFFENSES, FORCIBLE', 'SEX OFFENSES, NON FORCIBLE',
                'ASSAULT', 'ROBBERY', 'WEAPON LAWS', 'SUICIDE',
                'FAMILY OFFENSES']
    vandalism = ['SUSPICIOUS OCC', 'VANDALISM', 'TRESPASS']
    drugs = ['DRUG/NARCOTIC']
    alcohol = ['LIQUOR LAWS', 'DRUNKENNESS', 'DISORDERLY CONDUCT', 'LOITERING']
    prostitution = ['PROSTITUTION']
    dui = ['DRIVING UNDER THE INFLUENCE']

    # Build a category -> coarse-label mapping and apply it vectorized.
    # This replaces the original row loop of chained assignments
    # (df['col'][i] = ...), which pandas may silently apply to a copy
    # (SettingWithCopy).  Every category that survived the filter above
    # appears in one of the buckets, so .map() introduces no NaN.
    # NOTE(review): 'CoarseCategroy' is a typo for 'CoarseCategory', kept
    # because the returned column name is part of the output schema.
    coarse_map = {}
    for cats, label in [(violence, 'violence'), (vandalism, 'vandalism'),
                        (drugs, 'drugs'), (alcohol, 'alcohol'),
                        (prostitution, 'prostitution'), (dui, 'dui')]:
        for cat in cats:
            coarse_map[cat] = label
    df['CoarseCategroy'] = df['category'].map(coarse_map)

    # add Coarse descriptor Kris code....

    # Project x/y with a Lambert conformal conic (California-style
    # parameters on GRS80).
    # NOTE(review): this is the FORWARD transform (lon/lat -> metres); if
    # x/y are already projected coordinates, the intended call may be
    # isn2004(x, y, inverse=True) to recover lon/lat -- confirm against
    # the data.
    isn2004 = pyproj.Proj("+proj=lcc +lat_1=38.43333333333333 +lat_2=37.06666666666667 +lat_0=36.5 +lon_0=-120.5 +x_0=2000000 +y_0=500000 +ellps=GRS80 +towgs84=0,0,0,0,0,0,0 +units=m +no_defs")
    df.x = df.x.astype(float)
    df.y = df.y.astype(float)

    pairs = [isn2004(px, py) for px, py in zip(df.x, df.y)]
    # Tuple-unpack instead of indexing zip(): zip() returns a lazy,
    # non-subscriptable object on Python 3; unpacking works on both 2 and 3.
    newx, newy = zip(*pairs)
    df["newx"] = list(newx)
    df["newy"] = list(newy)

    return df


# Run the pull/preprocess pipeline when executed as a script.  The
# returned DataFrame is discarded here; import main() to use the result.
if __name__ == '__main__':
    main()


Loading

0 comments on commit e414e5a

Please sign in to comment.