-
Notifications
You must be signed in to change notification settings - Fork 0
/
process services.r
117 lines (96 loc) · 5.95 KB
/
process services.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
##
## Process BRC's independent living services in England to display on the winter pressures map
##
library(tidyverse)
library(readxl)
library(stringr)
source("init.r")
# load independent living services to show on map
services = read_excel(file.path(data.dir, services.dir, "Service Database_Export_08 01 2018_ENGLAND_S%40H_TS.xlsx"))
services_se = read_excel(file.path(data.dir, services.dir, "Mapping services 180129.xlsx"), sheet="South East")
# unique(services$`Area Name`)
# keep only areas in England
services = services %>%
filter(`Area Name` %in% c("NORTH", "CENTRAL", "SOUTH EAST", "SOUTH & THE CHANNEL ISLANDS", "LONDON", "Northern (Manvers)", "Harlow", "Crawley", "Telford", "Carlisle", "Isle of Wight", "Warmley")) %>%
filter(`Is This Scheme Inactive?` == "NO") %>%
select(Type = `Type of Service`, Name = `Official Scheme Name`, Category = `Categorisation (Primary Scheme Classification)`,
Staff = `Number of staff currently attached to this service`, Vols = `Number of volunteers currently attached to this service`,
`Hospital 1`, `Hospital 2`, `Hospital 3`, `Hospital 4`,
Postcode = `Location Scheme Postcode`, Location = `Location of Scheme`, Location_hospital = `Location if Scheme in a Hospital`)
services_se = services_se %>%
select(Type = `Type of Service`, Name = `Official Scheme Name`, Category = `Categorisation (Primary Scheme Classification)`,
Staff, Vols, Hospitals = Hospital, Postcode = `Location Scheme Postcode`, Location)
##
## clean postcodes
##
# regular expression to match postcodes (allowing lowercase and unlimited spaces)
# source: https://stackoverflow.com/a/7259020
# see also: page 6 of https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/488478/Bulk_Data_Transfer_-_additional_validation_valid_from_12_November_2015.pdf
postcode_regex = "(([gG][iI][rR] {0,}0[aA]{2})|((([a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y]?[0-9][0-9]?)|(([a-pr-uwyzA-PR-UWYZ][0-9][a-hjkstuwA-HJKSTUW])|([a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y][0-9][abehmnprv-yABEHMNPRV-Y]))) {0,}[0-9][abd-hjlnp-uw-zABD-HJLNP-UW-Z]{2}))"
# str_extract("WP0 8XX", postcode_regex) # test whether regex matches fake but well-formed postcodes (it does)
# find anything that looks like a postcode and convert them into separate entries
services = services %>%
mutate(Postcode = str_extract_all(Postcode, postcode_regex)) %>% # convert entries containing multiple postcodes into lists
mutate(Postcode = na_if(Postcode, "character(0)")) %>% # set empty lists to NAs (otherwise unnest() doesn't work properly)
unnest() %>% # convert entries with multiple postcodes into separate rows
mutate(Postcode = toupper(Postcode)) # postcodes should be uppercase
services_se = services_se %>%
mutate(Postcode = str_extract_all(Postcode, postcode_regex)) %>% # convert entries containing multiple postcodes into lists
mutate(Postcode = na_if(Postcode, "character(0)")) %>% # set empty lists to NAs (otherwise unnest() doesn't work properly)
unnest() %>% # convert entries with multiple postcodes into separate rows
mutate(Postcode = toupper(Postcode)) # postcodes should be uppercase
# count number of services with valid postcodes
# sum(!is.na(services$Postcode))
# sum(!is.na(services_se$Postcode))
# some data cleaning
services = services %>%
mutate(`Hospital 1` = na_if(`Hospital 1`, "*NOT APPLICABLE*"),
`Hospital 2` = na_if(`Hospital 2`, "*NOT APPLICABLE*"),
`Hospital 3` = na_if(`Hospital 3`, "*NOT APPLICABLE*"),
`Hospital 4` = na_if(`Hospital 4`, "*NOT APPLICABLE*"))
# combine `Hospital X` columns into a single list
# just make a comma-separated string since we're only going to display this data in the map popups
services = services %>%
# combine the four hospitals columns into a comma-separated list
unite(Hospitals, `Hospital 1`, `Hospital 2`, `Hospital 3`, `Hospital 4`, sep=", ", remove=T) %>%
# replace NAs with spaces
mutate(Hospitals = str_replace_all(Hospitals, "NA", " ")) %>%
# remove empty list items
mutate(Hospitals = str_replace_all(Hospitals, "^\\s,\\s|,\\s{2}", "")) # source: https://stackoverflow.com/a/39358929
# some data cleaning
services$Location_hospital = NULL
services_se = services_se %>%
mutate(Location = na_if(Location, 0))
# append South East services to main services dataframe
services = bind_rows(services, services_se)
#################################################################################
## get coordinates for postcodes
##
postcodes = read_csv(file.path(data.dir, "Postcodes", "National_Statistics_Postcode_Lookup - BRC.csv"),
col_types = cols(
Postcode = col_character(),
Longitude = col_double(),
Latitude = col_double(),
Country = col_character(),
`Output Area` = col_character(),
LSOA = col_character(),
`Local Authority Code` = col_character(),
`Rural or Urban?` = col_character(),
`Rural Urban classification` = col_character(),
IMD = col_integer(),
`Rurality index` = col_double()
))
# the ONS data truncates 7-character postcodes to remove spaces (e.g. CM99 1AB --> CM991AB); get rid of all spaces in both datasets to allow merging
postcodes$Postcode2 = gsub(" ", "", postcodes$Postcode)
services$Postcode2 = gsub(" ", "", services$Postcode)
# merge
services = services %>%
left_join(postcodes, by="Postcode2")
# clean up postcode columns
services$Postcode2 = NULL # don't need the truncated column anymore
services$Postcode.y = NULL
services = rename(services, Postcode = Postcode.x)
##
## save processed services
##
write_csv(services, file.path(data.dir, services.dir, "IL services England.csv"))