forked from probsys/hierarchical-irm
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #132 from probcomp/080124-thomaswc-pclean_data
Add PClean datasets and schema and run pclean/pclean on them in the integration test
- Loading branch information
Showing
10 changed files
with
53,514 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# PClean schema for flights_dirty.csv | ||
# Based on https://github.com/probcomp/PClean/blob/master/experiments/flights/run.jl | ||
|
||
class TrackingWebsite | ||
name ~ stringcat(strings="aa airtravelcenter allegiantair boston businesstravellogue CO den dfw flightarrival flightaware flightexplorer flights flightstats flightview flightwise flylouisville flytecomm foxbusiness gofox helloflight iad ifly mco mia myrateplan mytripandmore orbitz ord panynj phl quicktrip sfo src travelocity ua usatoday weather world-flight-tracker wunderground") | ||
|
||
class Time | ||
time ~ string | ||
|
||
class Flight | ||
flight_id ~ string | ||
  # Abbreviations: sdt/sat = scheduled departure/arrival time, adt/aat = actual departure/arrival time | ||
sdt ~ Time | ||
sat ~ Time | ||
adt ~ Time | ||
aat ~ Time | ||
|
||
class Obs | ||
flight ~ Flight | ||
src ~ TrackingWebsite | ||
|
||
observe | ||
src.name as src | ||
flight.flight_id as flight | ||
flight.sdt.time as sched_dep_time | ||
flight.sat.time as sched_arr_time | ||
flight.adt.time as act_dep_time | ||
flight.aat.time as act_arr_time | ||
from Obs |
Large diffs are not rendered by default.
Oops, something went wrong.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
# PClean schema for hospital_dirty.csv | ||
# Based on https://github.com/probcomp/PClean/blob/master/experiments/hospital/run.jl | ||
|
||
class County | ||
state ~ stringcat(strings="AL AK AZ AR CA CO CT DE DC FL GA HI ID IL IN IA KS KY LA ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI WY") | ||
county ~ string | ||
|
||
class Place | ||
county ~ County | ||
city ~ string | ||
|
||
class Condition | ||
desc ~ string | ||
|
||
class Measure | ||
code ~ stringcat(strings="ami-1 ami-2 ami-3 ami-4 ami-5 ami-7a ami-8a ami-x amix1 amix2 amx-3 amx-4 axi-2 axi-4 cac-1 cac-2 cac-3 hf-1 hf-2 hf-3 hf-4 hf-x hfx1 hfx4 hx-1 hx-2 pn-2 pn-3b pn-4 pn-5c pn-6 pn-7 pn-x pnx5c pnx6 pn-xb px-4 px-5c scip-card-2 scip-inf-1 scip-inf-2 scip-inf-3 scip-inf-4 scip-inf-6 scip-inx-4 scip-vte-1 scip-vte-2 scip-vtx-1 scipxinfx1 scix-inf-2 scxp-xnf-3 sxip-vte-1 xax-1 xf-1") | ||
name ~ string # TODO(thomaswc): Consider using stringcat instead. | ||
condition ~ Condition | ||
|
||
class HospitalType | ||
desc ~ string | ||
|
||
# A hospital: references a Place (loc) and a HospitalType (type), plus | ||
# free-form string attributes (name, addr, phone) and categorical | ||
# attributes drawn from fixed string lists (provider, owner, zip, service). | ||
class Hospital | ||
  loc ~ Place | ||
  type ~ HospitalType | ||
  # TODO(thomaswc): Add joint type for "numbers corrupted by typos" | ||
  provider ~ stringcat(strings="10001 10005 10006 10007 10008 10009 10010 10011 10012 10015 10016 10018 10019 10021 10022 10023 10024 10025 10027 10029 10032 10033 10034 10035 10036 10038 10039 10040 10043 10044 10045 10046 10047 10049 10050 10055 10056 10085 10086 10087 10108 10158 10164 20017 20018") | ||
  name ~ string | ||
  addr ~ string | ||
  phone ~ string | ||
  # Owner categories contain spaces, so the list uses ":" as its delimiter. | ||
  # Each category must appear exactly once in the list. | ||
  owner ~ stringcat(strings="government - federal:government - hospital district or authority:government - local:government - state:proprietary:voluntary non-profit - church:voluntary non-profit - other:voluntary non-profit - private", delim=":") | ||
  zip ~ stringcat(strings="35007 35045 35058 35121 35150 35205 35233 35235 35555 35570 35594 35609 35631 35640 35653 35660 35801 35901 35903 35957 35960 35968 35976 36033 36049 36067 36078 36106 36116 36201 36278 36302 36305 36323 36330 36360 36420 36467 36617 36784 36801 36854 99508 99559") | ||
  service ~ stringcat(strings="no yes") | ||
|
||
class Record | ||
hosp ~ Hospital | ||
metric ~ Measure | ||
|
||
observe | ||
# TODO(thomaswc): Model Score, Sample and Stateavg | ||
hosp.provider as ProviderNumber | ||
hosp.name as HospitalName | ||
hosp.addr as Address1 | ||
hosp.loc.city as City | ||
hosp.loc.county.state as State | ||
hosp.zip as ZipCode | ||
hosp.loc.county.county as County | ||
hosp.phone as PhoneNumber | ||
hosp.type.desc as HospitalType | ||
hosp.owner as HospitalOwner | ||
hosp.service as EmergencyService | ||
metric.condition.desc as Condition | ||
metric.code as MeasureCode | ||
metric.name as MeasureName | ||
from Record |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# PClean schema for rents_dirty.csv | ||
# Based on https://github.com/probcomp/PClean/blob/master/experiments/rents/run.jl | ||
|
||
class County | ||
name ~ string | ||
state ~ stringcat(strings="AL AK AZ AR CA CO CT DE DC FL GA HI ID IL IN IA KS KY LA ME MD MA MI MN MS MO MT NE NV NH NJ NM NY NC ND OH OK OR PA RI SC SD TN TX UT VT VA WA WV WI WY") | ||
|
||
class Obs | ||
county ~ County | ||
br ~ stringcat(strings="1br 2br 3br 4br studio") | ||
rent ~ real | ||
|
||
observe | ||
county.name as County | ||
county.state as State | ||
br as "Room Type" | ||
rent as "Monthly Rent" | ||
from Obs |
Oops, something went wrong.