Skip to content

Commit

Permalink
Merge pull request #121 from VIDA-NYU/compact-archive
Browse files Browse the repository at this point in the history
Compact archive
  • Loading branch information
heikomuller authored Apr 27, 2021
2 parents da6c501 + ff8bd89 commit 25a2c0f
Show file tree
Hide file tree
Showing 74 changed files with 2,701 additions and 2,772 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,11 @@

* Make checking out a committed dataset in the `openclean.data.archive.base.ArchiveStore` optional.
* Enable cache refresh for cached datasets in `openclean.data.archive.cache.CachedDatastore`.


### 0.4.0 - TBD

* Use compact serialization for HISTORE archives.
* Load and sample datasets from a data stream in `openclean.engine.base.OpencleanEngine`.
* Support stream operators on dataset snapshots in `openclean.engine.base.OpencleanEngine`.
* Add summary for data frame conflict groups.
4 changes: 2 additions & 2 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ dill
pandas>=1.0.0
scikit-learn
jsonschema>=3.2.0
histore==0.3.1
histore>=0.4.0
flowserv-core>=0.8.0
jellyfish
refdata>=0.2.0
Expand All @@ -18,4 +18,4 @@ sphinxcontrib-apidoc
jupyter-sphinx
git+https://github.com/spatialaudio/nbsphinx.git@master
nbsphinx-link
sphinxcontrib-spelling
sphinxcontrib-spelling
15 changes: 2 additions & 13 deletions examples/notebooks/Country Names.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -44,33 +44,22 @@
"3 Algeria DZ DZA Algiers Africa Northern Africa\n",
"4 American Samoa AS ASM Pago Pago Oceania Polynesia\n"
]
},
{
"data": {
"text/plain": [
"<Snapshot (version=0 description='' at=2021-02-26 11:06:39.568311-05:00)>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Download the current listing of country names in the world.\n",
"\n",
"from openclean.data.refdata import RefStore\n",
"\n",
"import openclean.data.archive as masterdata\n",
"import openclean.data.archive.base as masterdata\n",
"\n",
"refstore = RefStore(basedir=datadir)\n",
"refstore.download('restcountries.eu')\n",
"countries = refstore.load('restcountries.eu').df()\n",
"\n",
"print(countries.head())\n",
"\n",
"archive = masterdata.create('restcountries', primary_key=['alpha3Code'], replace=True)\n",
"archive.commit(countries)"
"archive = masterdata.create('restcountries', source=countries, primary_key=['alpha3Code'], replace=True)"
]
},
{
Expand Down
29 changes: 10 additions & 19 deletions examples/notebooks/Masterdata.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
"source": [
"# Import the masterdata and reference data modules.\n",
"\n",
"import openclean.data.archive as masterdata\n",
"import openclean.data.archive.base as masterdata\n",
"import openclean.data.refdata as refdata"
]
},
Expand All @@ -48,10 +48,13 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Company Suffixes (company_suffixes)\n",
"Cities in the U.S. (encyclopaedia_britannica:us_cities)\n",
"NYC Finance - State Codes (nyc.gov:dof:state_codes)\n",
"REST Countries (restcountries.eu)\n",
"C1 Street Suffix Abbreviations (usps:street_abbrev)\n",
"C2 Secondary Unit Designators (usps:secondary_unit_designators)\n"
"C2 Secondary Unit Designators (usps:secondary_unit_designators)\n",
"States and territories of the U.S. (wikipedia:us_states)\n"
]
}
],
Expand Down Expand Up @@ -191,23 +194,11 @@
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Snapshot (version=0 description='' at=2021-02-26 11:07:34.902079-05:00)>"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"# Create a local masterdata archive for the countries dataset.\n",
"\n",
"archive = masterdata.create('restcountries', primary_key=['alpha3Code'], replace=True)\n",
"archive.commit(countries)"
"archive = masterdata.create('restcountries', source=countries, primary_key=['alpha3Code'], replace=True)"
]
},
{
Expand Down Expand Up @@ -387,7 +378,7 @@
{
"data": {
"text/plain": [
"<Snapshot (version=1 description='' at=2021-02-26 11:07:35.063135-05:00)>"
"<Snapshot (version=1 description='' at=2021-04-21 07:24:23.441467-04:00)>"
]
},
"execution_count": 10,
Expand Down Expand Up @@ -475,8 +466,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"<Snapshot (version=0 description='' at=2021-02-26 11:07:34.902079-05:00)>\n",
"<Snapshot (version=1 description='' at=2021-02-26 11:07:35.063135-05:00)>\n"
"<Snapshot (version=0 description='' at=2021-04-21 07:24:23.308028-04:00)>\n",
"<Snapshot (version=1 description='' at=2021-04-21 07:24:23.441467-04:00)>\n"
]
}
],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
"\n",
"from openclean.pipeline import stream\n",
"\n",
"df = stream(datafile).select('City ').update('City ', str.upper).sample(10000, seed=42).to_df()"
"df = stream(datafile).select('City ').update('City ', str.upper).sample(10000, random_state=42).to_df()"
]
},
{
Expand All @@ -47,17 +47,17 @@
{
"data": {
"text/plain": [
"NEW YORK 3680\n",
"BROOKLYN 1594\n",
"QUEENS 538\n",
"BRONX 470\n",
"NY 462\n",
" ... \n",
"FAR ROCKAWY 1\n",
"OLD BRIDGE 1\n",
"MARLTON 1\n",
"WILLIAMSBURG 1\n",
"NANUET 1\n",
"NEW YORK 3680\n",
"BROOKLYN 1594\n",
"QUEENS 538\n",
"BRONX 470\n",
"NY 462\n",
" ... \n",
"BROOKLLYN 1\n",
"CAMBRIA HEIGHTS 1\n",
"STUART 1\n",
"BRONXVILLE 1\n",
"BROOKLYM 1\n",
"Name: City , Length: 513, dtype: int64"
]
},
Expand Down Expand Up @@ -119,50 +119,53 @@
"name": "stdout",
"output_type": "stream",
"text": [
"4\tLIC.\n",
"4\tL.I.C.\n",
"\tSI,NY\n",
"\tN Y\n",
"\tB'KLYN\n",
"\t_BK\n",
"\tQUEEN S\n",
"\tN.Y.\n",
"\tNEW YORK\n",
"\tS.I.,NY\n",
"\tL.I.CITY\n",
"\tSUITE 2107 NY\n",
"\tNEW CANAAN\n",
"\tHOLLIS HILLS\n",
"\t_BK\n",
"\tS.I.\n",
"\tMIAMI\n",
"\tSUITE 2107 NY\n",
"\tL.I.C\n",
"\tFLUSHING MEADOW\n",
"\tNEW YORK\n",
"\tS.I.,NY\n",
"\tMIAMI\n",
"\tLIC.\n",
"\tBKLYN.\n",
"\tL.I.C.\n",
"\tN.Y.\n",
"\tSI,NY\n",
"\tB'KLYN\n",
"\tQUEEN S\n",
"\n",
"3\tROCKVILLE_CENTR\n",
"3\tLONG ISLN. CITY\n",
"\tNEW YOURK\n",
"\tS.OZONE PARK\n",
"\tJACKSON HTS.\n",
"\tS. OZONE PARK\n",
"\tBOONTON\n",
"\tMT. KISCO\n",
"\tATLANTA\n",
"\tRICHMOND-HILL\n",
"\tNEW YORK\\\n",
"\tS. RICHMOND HIL\n",
"\tHOLLIS HILLS\n",
"\tNEW CANAAN\n",
"\tLONG ISL.CITY\n",
"\tNEW YORK,\n",
"\tMANHATTAN\n",
"\tROCKVILLE_CENTR\n",
"\tMINEOLA,\n",
"\tNEW YORK CITY\n",
"\tBROOKLYN,\n",
"\tN.MIAMI BEACH\n",
"\tQUEENS _VILLAGE\n",
"\tFLUS. MEADOWS\n",
"\tPHILADELPHIA\n",
"\tCINCINNATI\n",
"\tLONG ISL. CITY\n",
"\tNEW YOURK\n",
"\tLONG ISLN. CITY\n",
"\tSO. PLAINFIELD\n",
"\tMC LEAN\n",
"\tRICHMOND-HILL\n",
"\tS. OZONE PARK\n",
"\tLONG ISL. CITY\n",
"\tS. PLAINFIELD\n",
"\tFLUSHING MEADOW\n",
"\tJACKSON HTS.\n",
"\tST. PETERSBURG\n",
"\tBROOKLYN,\n",
"\tNEW YORK CITY\n",
"\tNEW YORK, NY\n",
"\tPHILADELPHIA\n",
"\tMT.VERNON\n",
"\tNEW YORK\\\n",
"\tSO. OZONE PARK\n",
"\tNEW YORK, NY\n"
"\tMT. KISCO\n"
]
}
],
Expand Down Expand Up @@ -200,18 +203,18 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.9.1"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -271,18 +271,18 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.9.1"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -162,20 +162,13 @@
" \n",
"print('\\n{} rows.'.format(df.count()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"display_name": "Python 3",
"language": "python",
"name": ".venv"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
Expand All @@ -187,18 +180,18 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
"version": "3.9.1"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
}
Loading

0 comments on commit 25a2c0f

Please sign in to comment.