diff --git a/README.md b/README.md index 4c7726d..2dfdda7 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,15 @@ # anzsic codes + ANZSIC codes in a standard table format for people who are sick of ABS sharing data in non-standard formats. + +## Data sources + +| Filename | Source | Downloaded date | +|----------|----------|-------| +| `data/1292.0.55.005_ anzsic 2006 - correspondence tables.xls` | [Data 2](https://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/1292.0.55.0052006?OpenDocument) | 2024-10-23 | +| `data/1292.0.55.002_anzsic 2006 - codes and titles.xls` | [Data 4](https://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/1292.0.55.0022006?OpenDocument) | 2024-10-23 | + + # Known issues Please note that the 'nfd' (not further defined) classes are missing from the anzsic tables, as reported by @baslat in https://github.com/asiripanich/anzsic/issues/1. PRs to fix are welcome. diff --git a/data/1292.0.55.002_anzsic 2006 - codes and titles.xls b/data/1292.0.55.002_anzsic 2006 - codes and titles.xls new file mode 100644 index 0000000..daaadcc Binary files /dev/null and b/data/1292.0.55.002_anzsic 2006 - codes and titles.xls differ diff --git a/requirements.txt b/requirements.txt index bff55aa..255032a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ polars==1.10.0 pyarrow==17.0.0 -fastexcel==0.12.0 \ No newline at end of file +fastexcel==0.12.0 +papermill==2.6.0 \ No newline at end of file diff --git a/tidy.ipynb b/tidy.ipynb new file mode 100644 index 0000000..76a7455 --- /dev/null +++ b/tidy.ipynb @@ -0,0 +1,139 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl\n", + "import polars.selectors as cs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def read_anzsic_sheet(sheet_name, prefix):\n", + " df = (\n", + " pl.read_excel(\n", + " r\"data/1292.0.55.002_anzsic 2006 - codes and 
titles.xls\",\n", + " sheet_name=sheet_name,\n", + " drop_empty_rows=True,\n", + " )\n", + " .drop(cs.by_index(0))\n", + " .select(cs.by_index(-2, -1))\n", + " .with_columns(pl.all().fill_null(strategy=\"forward\"))\n", + " .drop_nulls()\n", + " )\n", + " df.columns = [f\"{prefix}_code\", f\"{prefix}_title\"]\n", + " df = df.unique(f\"{prefix}_title\", keep=\"first\", maintain_order=True).sort(\n", + " f\"{prefix}_code\"\n", + " )\n", + " if prefix != \"division\":\n", + " df = df.with_columns(\n", + " pl.col(f\"{prefix}_code\").str.replace(r\".$\", \"\").alias(f\"{prefix}_parent\")\n", + " )\n", + " return df\n", + "\n", + "\n", + "anzsic06 = {\n", + " sheet_name: read_anzsic_sheet(sheet_name, prefix)\n", + " for sheet_name, prefix in zip(\n", + " [\"Divisions\", \"Groups\", \"Classes\"], [\"division\", \"group\", \"class\"]\n", + " )\n", + "}\n", + "\n", + "anzsic06" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create Subdivisions\n", + "anzsic06[\"Subdivisions\"] = pl.read_excel(\n", + " r\"data/1292.0.55.002_anzsic 2006 - codes and titles.xls\",\n", + " sheet_name=\"Subdivisions\",\n", + " drop_empty_rows=True,\n", + ").drop(cs.by_index(0))\n", + "\n", + "anzsic06[\"Subdivisions\"] = (\n", + " anzsic06[\"Subdivisions\"]\n", + " .with_columns(pl.all().fill_null(strategy=\"forward\"))\n", + " .drop_nulls()\n", + ")\n", + "\n", + "anzsic06[\"Subdivisions\"].columns = [\n", + " \"division_code\",\n", + " \"subdivision_code\",\n", + " \"subdivision_title\",\n", + "]\n", + "\n", + "anzsic06[\"Subdivisions\"] = (\n", + " anzsic06[\"Subdivisions\"]\n", + " .unique(\"subdivision_title\")\n", + " .sort([\"division_code\", \"subdivision_code\"])\n", + ")\n", + "\n", + "anzsic06[\"Subdivisions\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "anzsic06_combined = (\n", + " anzsic06[\"Divisions\"]\n", + " 
.join(anzsic06[\"Subdivisions\"], on=\"division_code\", how=\"full\")\n", + " .join(\n", + " anzsic06[\"Groups\"],\n", + " left_on=\"subdivision_code\",\n", + " right_on=\"group_parent\",\n", + " how=\"full\",\n", + " )\n", + " .join(\n", + " anzsic06[\"Classes\"], left_on=\"group_code\", right_on=\"class_parent\", how=\"full\"\n", + " )\n", + " .drop(cs.ends_with(\"parent\"), cs.ends_with(\"_right\"))\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "anzsic06_combined.write_csv(\"anzsic06.csv\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "anzsic", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tidy_anzsic.py b/tidy_anzsic.py deleted file mode 100644 index 9f114b8..0000000 --- a/tidy_anzsic.py +++ /dev/null @@ -1,2 +0,0 @@ -import polars as pl -