Commit
terencekwt committed Mar 29, 2015
2 parents 3ec0fef + 0f82980 commit 561e4ab
Showing 5 changed files with 152 additions and 69 deletions.
5 changes: 3 additions & 2 deletions .gitignore
@@ -12,8 +12,9 @@
vignettes/*.html
vignettes/*.pdf

-# Data folders
-data
+## Data folders
+#data
+#processed_data

# Map files
*.dbf
73 changes: 44 additions & 29 deletions datasf_crime_pull_preprocess.py
@@ -46,48 +46,63 @@ def main():
    df = df.sort(columns=['incidntnum', 'datetime']).reset_index(drop=True)

    # Cut resolution
-    these = ['UNFOUNDED', 'CLEARED-CONTACT JUVENILE FOR MORE INFO', 'EXCEPTIONAL CLEARANCE']
-    df = df[~df.resolution.isin(these)].reset_index(drop=True)
+    unrelated_res = ['UNFOUNDED', 'CLEARED-CONTACT JUVENILE FOR MORE INFO', 'EXCEPTIONAL CLEARANCE']
+    df = df[~df.resolution.isin(unrelated_res)].reset_index(drop=True)

    # Cut all before 2010
    df = df[df.datetime > pd.to_datetime('2010', format='%Y')].reset_index(drop=True)

    # Cut Category (s)
-    keepthese = ['SUICIDE', 'SEX OFFENSES, FORCIBLE', 'ASSAULT', 'ROBBERY', 'WEAPON LAWS', 'DRUG/NARCOTIC',
+    related_cat = ['SUICIDE', 'SEX OFFENSES, FORCIBLE', 'ASSAULT', 'ROBBERY', 'WEAPON LAWS', 'DRUG/NARCOTIC',
                    'DRUNKENNESS', 'DRIVING UNDER THE INFLUENCE', 'DISORDERLY CONDUCT', 'LIQUOR LAWS',
                    'VANDALISM', 'FAMILY OFFENSES', 'PROSTITUTION', 'SEX OFFENSES, NON FORCIBLE', 'TRESPASS',
                    'LOITERING', 'SUSPICIOUS OCC']

-    df = df[df.category.isin(keepthese)].reset_index(drop=True)
+    df = df[df.category.isin(related_cat)].reset_index(drop=True)

    # Throw out garbage columns
-    keepthese = ['incidntnum', 'category', 'descript', 'dayofweek', 'pddistrict', 'resolution', 'address', 'x', 'y',
+    relevant_param = ['incidntnum', 'category', 'descript', 'dayofweek', 'pddistrict', 'resolution', 'address', 'x', 'y',
                      'datetime']
-    df = df[keepthese]
+    df = df[relevant_param]

    # add Coarse Category
-    violence = ['SEX OFFENSES, FORCIBLE', 'SEX OFFENSES, NON FORCIBLE', 'ASSAULT', 'ROBBERY', 'WEAPON LAWS', 'SUICIDE',
-                'FAMILY OFFENSES']
-    vandalism = ['SUSPICIOUS OCC', 'VANDALISM', 'TRESPASS']
-    drugs = ['DRUG/NARCOTIC']
-    alcohol = ['LIQUOR LAWS', 'DRUNKENNESS', 'DISORDERLY CONDUCT', 'LOITERING']
-    prostitution = ['PROSTITUTION']
-    dui = ['DRIVING UNDER THE INFLUENCE']
-
-    df['CoarseCategroy'] = None
-    for i in df.index:
-        if df.category[i] in violence:
-            df['CoarseCategroy'][i] = 'violence'
-        if df.category[i] in vandalism:
-            df['CoarseCategroy'][i] = 'vandalism'
-        if df.category[i] in drugs:
-            df['CoarseCategroy'][i] = 'drugs'
-        if df.category[i] in alcohol:
-            df['CoarseCategroy'][i] = 'alcohol'
-        if df.category[i] in prostitution:
-            df['CoarseCategroy'][i] = 'prostitution'
-        if df.category[i] in dui:
-            df['CoarseCategroy'][i] = 'dui'
+    # violence = ['SEX OFFENSES, FORCIBLE', 'SEX OFFENSES, NON FORCIBLE', 'ASSAULT', 'ROBBERY', 'WEAPON LAWS', 'SUICIDE',
+    #             'FAMILY OFFENSES']
+    # vandalism = ['SUSPICIOUS OCC', 'VANDALISM', 'TRESPASS']
+    # drugs = ['DRUG/NARCOTIC']
+    # alcohol = ['LIQUOR LAWS', 'DRUNKENNESS', 'DISORDERLY CONDUCT', 'LOITERING']
+    # prostitution = ['PROSTITUTION']
+    # dui = ['DRIVING UNDER THE INFLUENCE']
+    crimes = {
+        "violence": ['SEX OFFENSES, FORCIBLE', 'SEX OFFENSES, NON FORCIBLE', 'ASSAULT', 'ROBBERY', 'WEAPON LAWS',
+                     'SUICIDE', 'FAMILY OFFENSES'],
+        "vandalism": ['SUSPICIOUS OCC', 'VANDALISM', 'TRESPASS'],
+        "drugs": ['DRUG/NARCOTIC'],
+        "alcohol": ['LIQUOR LAWS', 'DRUNKENNESS', 'DISORDERLY CONDUCT', 'LOITERING'],
+        "prostitution": ['PROSTITUTION'],
+        "dui": ['DRIVING UNDER THE INFLUENCE']}

+    # Map each fine-grained category to its coarse group.
+    for crime in crimes:
+        for crime_type in crimes[crime]:
+            # .loc assignment creates and fills the CoarseCategory column;
+            # chained indexing (df.CoarseCategory[...]) would fail here because
+            # the column does not exist yet, and would not update df reliably.
+            df.loc[df.category == crime_type, 'CoarseCategory'] = crime

+    # for i in violence:
+    #     df.CoarseCategory[df.category == i] = 'violence'
+    #
+    # df['CoarseCategroy'] = None
+    # for i in df.index:
+    #     if df.category[i] in violence:
+    #         df['CoarseCategroy'][i] = 'violence'
+    #     if df.category[i] in vandalism:
+    #         df['CoarseCategroy'][i] = 'vandalism'
+    #     if df.category[i] in drugs:
+    #         df['CoarseCategroy'][i] = 'drugs'
+    #     if df.category[i] in alcohol:
+    #         df['CoarseCategroy'][i] = 'alcohol'
+    #     if df.category[i] in prostitution:
+    #         df['CoarseCategroy'][i] = 'prostitution'
+    #     if df.category[i] in dui:
+    #         df['CoarseCategroy'][i] = 'dui'

    # add Coarse descriptor Kris code....

@@ -106,7 +121,7 @@ def main():
    df["newx"] = tmp[0]
    df["newy"] = tmp[1]

-    return df
+    df.to_csv("final_data.csv", index=False)


if __name__ == '__main__':
32 changes: 32 additions & 0 deletions sfhip-app/runapp.R
@@ -0,0 +1,32 @@
+rm(list = ls())
+# (Run the following lines if this is the first time you are using runapp.R.)
+# pkgs <- c("shiny", "devtools",
+#           "dplyr", "ggplot2", "ggmap", "scales", "grid")
+# install.packages(pkgs, repos = "http://cran.r-project.org")
+# sapply(pkgs, library, character.only = T)
+# devtools::install_github("rstudio/shinydashboard")
+# devtools::install_github("rstudio/shinyapps")

+# To view the app in a browser
+library(shiny)
+work_dir <- "~/Copy/sfhip"
+setwd(work_dir)
+runApp(getwd(), port = 1234)

+# To deploy the app to the web
+setwd("~/Copy/sfhip-app")
+library(shinyapps)
+### CHANGE: fill in your own shinyapps.io credentials
+name <- "username"
+token <- "shinytoken"
+secret <- "shinysecret"
+shinyapps::setAccountInfo(name = name, token = token, secret = secret)
+# sessionInfo()
+options(shinyapps.http.trace = TRUE)  # trace HTTP requests to debug deployment errors
+deployApp()
+# (answer "Y" when deployApp() asks for confirmation)
+shinyapps::showLogs()
+#----------------------------------------------------------------------
+# End
+#----------------------------------------------------------------------

111 changes: 73 additions & 38 deletions sfhip_statistical_analysis.Rmd
@@ -55,44 +55,79 @@ date: "March 29, 2015"
output: pdf_document
---

-# Introduction ---

-- what this is: a collection of initial statistical insights from a data dive
-- Not a refined product, some lines of thought could be more developed, but wanted
-  to share everything we thought of.
-- Avoid math, focus on plots

-## Goals ---

-- Global patterns and local anomalies
-    + What is the general relationship between crime and liquor store levels?
-    + Are there positions that deviate from these local anamolies substantially?
-    + How can demographic information be integrated? Can we match?
-- Are there particularly bad types of liquor establishments?
-- What is the effect of different total sales volume within different tracts,
-  and within individual stores?

-## Approach --

-- The data:
-    + We use alcohol license information, census tract information, and crime information; haven't used 311 reported data
-- Aggregate data to census level, divide by total population
-    + Ignores store effects
-    + Ignores temporal effects
-    + Throw out tracts with fewer than 500 people, estimation too hard
-- Aggregation using License Type and Crime Category
-    + Could replicate analsis at different levels of granularity
-    + Coarser, and no real need to do dimension reduction, but perhaps less rich picture
-    + Finer, and the level of reduction would be more challenging: Working in higher
-      dimensional space.
-- Remainder is just dimension reduction followed by some sort of association
-    + Crudest dimension reduction is just summing counts
-    + Another crude dimension reduction is ignoring dimensions
-    + Once we cluster based on two sets of vars, see how the clusterings compare
-    + See how the distances compare
-    + Perform dimension reduction jointly

-# Regression on Aggregated Counts ---
+# Introduction

+In this document, we collect the statistical approaches and insights from DataKind's
+March 27-29, 2015 DataDive. While we highlight our most interesting findings, we
+also want to share alternative lines of thought we developed but did not completely
+refine, in the hope that this can guide future analysis.

+## Goals

+There are two overall goals in this analysis: global patterns and local
+anomalies. On the one hand, we would like to summarize relationships between
+the presence of liquor stores and crime that apply generally across
+San Francisco. On the other hand, we would like to highlight those locations
+that deviate from the general patterns. In both cases, we would
+like to integrate demographic information. For example, are there locations with
+similar demographics and numbers of liquor stores, but very different crime rates?

+## Approach

+### Data Available

+We consider three primary sources of data:

+- Census information: demographic information at the census tract level. This
+  includes overall population, population breakdown across races, unemployment
+  rate, and median income.
+- Crime data: we have crime reports (from the SFPD?) mapping the time and place
+  of crimes within the city over the last 10 years. These reports also include
+  descriptions of the type of crime at varying levels of granularity -- a report
+  may be classified at a coarse level as robbery, and at a fine level as robbery
+  at an ATM, for instance.
+- Alcohol license data: we have records of alcohol licenses spanning more than a
+  decade. These licenses are required of any venue that sells or distributes
+  alcohol, including bars, clubs, and convenience stores. The records include the
+  location of each vendor, as well as a license type (bars and liquor stores
+  require different licenses, for example).

+### Data used

+We chose to aggregate the crime and alcohol license data to the census tract
+level, and then normalize by census tract population. More specifically, we
+(1) counted the number of venues using each of the 23 license types within each
+census tract, then divided by the population of that tract, and (2) counted the
+number of crimes within each of the 30 description groups. We could have used
+finer or coarser description types for both liquor vendors and crimes, but this
+level seemed to offer a rich description without making the problem too
+high-dimensional and intractable. Further, we discarded tracts with fewer than
+500 residents, since our estimates of the densities of crimes and liquor venues
+in such sparsely populated areas are less reliable. (A code sketch of this
+aggregation follows.)
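
As an illustration, here is a minimal dplyr sketch of that aggregation. The data
frames `licenses`, `crimes`, and `tracts` and the columns `tract_id`,
`license_type`, `descript_group`, and `population` are hypothetical placeholders,
not the names used in the actual data pull:

```r
library(dplyr)

# Per-capita license counts by tract and license type (hypothetical schema).
license_rates <- licenses %>%
  count(tract_id, license_type) %>%        # one row per tract x license type
  left_join(tracts, by = "tract_id") %>%   # attach tract populations
  filter(population >= 500) %>%            # drop sparsely populated tracts
  mutate(rate = n / population)            # venues per resident

# Crime counts by tract and description group, normalized the same way.
crime_rates <- crimes %>%
  count(tract_id, descript_group) %>%
  left_join(tracts, by = "tract_id") %>%
  filter(population >= 500) %>%
  mutate(rate = n / population)
```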

+Notice that, at this stage, we have ignored (1) any spatial information at a
+finer resolution than the census tract level and (2) any temporal effects.
+Nonetheless, we believe our methods could be generalized to handle these
+situations as well.

+### Methods

+For the first task, identifying global patterns, our overall method has two
+steps: dimension reduction followed by some measure of association. By dimension
+reduction, we mean reducing many different measurements to just a few -- for
+example, the number of college-educated people and the median income of a census
+tract can both be explained by an underlying "affluence" effect. By association,
+we mean taking these underlying factors and determining whether and how they are
+correlated.

+The specific tools we applied to do this reduction vary in complexity. From
+crudest to most (but still not very) sophisticated, we used the following (a
+sketch of the reduce-then-associate recipe appears after this list):

+- Summing counts across types (the crudest reduction)
+- Ignoring all but a few dimensions (almost as crude)
+- Clustering on each of the two sets of variables, then comparing the clusterings
+- Comparing the distances each set of variables induces between tracts
+- Performing the dimension reduction on both sets jointly
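
As a concrete sketch of the reduce-then-associate recipe, we can take the first
principal component of each block as the reduction and a correlation test as the
association. Here `license_rates_mat` and `crime_rates_mat` are assumed to be
tract-by-type numeric matrices built from the normalized counts above; both
names are hypothetical:

```r
# Reduce each block of variables to its first principal component, then test
# how strongly the two reduced scores are associated across census tracts.
lic_score   <- prcomp(license_rates_mat, scale. = TRUE)$x[, 1]
crime_score <- prcomp(crime_rates_mat,  scale. = TRUE)$x[, 1]
cor.test(lic_score, crime_score)  # association between the one-dimensional summaries
```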

+# Regression on Aggregated Counts

- First, just summing counts of licenses
- We need to compare rates
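
One standard way to compare rates rather than raw counts is Poisson regression
with population as an exposure offset. A minimal sketch, assuming a per-tract
table `tract_summary` with hypothetical columns `crime_count`, `license_count`,
and `population`:

```r
# Crime counts regressed on license counts, with log(population) as an offset
# so the coefficients describe per-capita rates rather than raw counts.
fit <- glm(crime_count ~ license_count,
           offset = log(population),
           family = poisson,
           data   = tract_summary)
summary(fit)
```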
Binary file added sfhip_statistical_analysis.pdf
Binary file not shown.
