Commit
Merge pull request #27 from UI-Research/web-scraping-workshop
Re-rendered intro to pandas
Showing 233 changed files with 56,118 additions and 7,330 deletions.
@@ -0,0 +1,70 @@
---
title: "Reference File"
format: html
editor: visual
---

## State Zip Code Data

```{r}
# Clean the zip/state csv file for use in web scraping
library(here)
library(tidyverse)
library(jsonlite)
library(tidylog)

# Read in zip-state.csv using the here package
file_path <- here("misc-resources", "web-scraping", "workshop_2024", "data")

zip_data <- read_csv(here(file_path, "zip-state.csv")) %>%
  # filter out ZCTAs: keep numeric zip codes with a non-missing city
  filter(str_detect(zipcode, "^[0-9]+$"),
         !is.na(city)) %>%
  # keep one zip code per county
  distinct(state, county, .keep_all = TRUE) %>%
  mutate(state_abbr = str_to_lower(state_abbr)) %>%
  # keep just the state abbreviation, county, and zip code
  select(state_abbr, county, zipcode)

# Limit to a small sample for the workshop
set.seed(1)
zip_data_small <- zip_data %>%
  sample_n(10)

write_csv(zip_data_small, here(file_path, "zip_data_small.csv"))
```

## Nested Dictionary

```{python}
import json
import pandas as pd

# Read in the csv of states, counties, and zip codes (one zip per county);
# the file has columns 'state_abbr', 'county', and 'zipcode'
csv_file_path = 'misc-resources/web-scraping/workshop_2024/data/zip_data_small.csv'
raw_csv = pd.read_csv(csv_file_path, dtype={'zipcode': str})

# Convert the table to a JSON records string (not used below)
raw_json = raw_csv.to_json(orient='records')

# Create a nested dictionary: state -> county -> zip code
state_counties_zipcodes = {}
for index, row in raw_csv.iterrows():
    state = row['state_abbr']
    county = row['county']
    zipcode = row['zipcode']
    if state not in state_counties_zipcodes:
        state_counties_zipcodes[state] = {}
    state_counties_zipcodes[state][county] = zipcode

# Save out to a json file
with open('misc-resources/web-scraping/workshop_2024/data/zip_data_small.json', 'w') as f:
    json.dump(state_counties_zipcodes, f, indent=4)
```
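As a quick check (not part of the diff above), here is a minimal sketch of reading the nested dictionary back and walking the state/county/zip pairs, assuming the same file path used when writing it:

```python
import json

# Path assumed to match the one used when writing the file above
json_path = 'misc-resources/web-scraping/workshop_2024/data/zip_data_small.json'

# Load the nested state -> county -> zip code dictionary
with open(json_path) as f:
    state_counties_zipcodes = json.load(f)

# Iterate the pairs, e.g. to build a list of scraping targets
for state, counties in state_counties_zipcodes.items():
    for county, zipcode in counties.items():
        print(state, county, zipcode)
```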
misc-resources/web-scraping/workshop_2024/data/output.json (62 changes: 62 additions & 0 deletions)
@@ -0,0 +1,62 @@
```json
{
    "Bourbon": [
        248.0,
        315.0,
        415.0,
        881.0
    ],
    "Marion": [
        341.0,
        433.0,
        570.0,
        1211.0
    ],
    "Kane": [
        249.0,
        316.0,
        416.0,
        883.0
    ],
    "Pawnee": [
        297.0,
        377.0,
        496.0,
        1054.0
    ],
    "Harvey": [
        277.0,
        351.0,
        462.0,
        982.0
    ],
    "Buchanan": [
        351.0,
        445.0,
        586.0,
        1245.0
    ],
    "Dooly": [
        248.0,
        314.0,
        414.0,
        879.0
    ],
    "Abbeville": [
        252.0,
        319.0,
        420.0,
        892.0
    ],
    "Kit Carson": [
        370.0,
        469.0,
        618.0,
        1311.0
    ],
    "Wicomico": [
        207.0,
        262.0,
        346.0,
        734.0
    ]
}
```
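For context (not part of the commit itself), a minimal sketch of loading output.json into a pandas DataFrame; the meaning of the four values per county is not documented in this diff, so the columns are left with their default integer names:

```python
import json
import pandas as pd

# Load the county -> list-of-values mapping shown above
with open('misc-resources/web-scraping/workshop_2024/data/output.json') as f:
    county_values = json.load(f)

# One row per county, one column per value (column meanings are not documented here)
df = pd.DataFrame.from_dict(county_values, orient='index')
df.index.name = 'county'
print(df.head())
```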