Skip to content

Commit

Permalink
feat: add a ipynb that creates anzsic06.csv
Browse files Browse the repository at this point in the history
  • Loading branch information
asiripanich committed Oct 23, 2024
1 parent 618ec70 commit bf384ae
Show file tree
Hide file tree
Showing 5 changed files with 151 additions and 3 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# anzsic codes

ANZSIC codes in a standard table format, for people who are sick of the ABS sharing data in non-standard formats.


## Data sources

| Filename | Source | Downloaded date |
|----------|----------|-------|
| `data/1292.0.55.005_ anzsic 2006 - correspondence tables.xls` | [Data 2](https://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/1292.0.55.0052006?OpenDocument) | 2024-10-23 |
| `data/1292.0.55.002_anzsic 2006 - codes and titles.xls` | [Data 4](https://www.abs.gov.au/AUSSTATS/abs@.nsf/DetailsPage/1292.0.55.0022006?OpenDocument) | 2024-10-23 |


# Known issues
Please note that the 'nfd' (not further defined) classes are missing from the ANZSIC tables, as reported by @baslat in https://github.com/asiripanich/anzsic/issues/1. PRs to fix this are welcome.
Binary file not shown.
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
polars==1.10.0
pyarrow==17.0.0
fastexcel==0.12.0
fastexcel==0.12.0
papermill==2.6.0
139 changes: 139 additions & 0 deletions tidy.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import polars as pl\n",
"import polars.selectors as cs"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def read_anzsic_sheet(sheet_name, prefix):\n",
" df = (\n",
" pl.read_excel(\n",
" r\"data/1292.0.55.002_anzsic 2006 - codes and titles.xls\",\n",
" sheet_name=sheet_name,\n",
" drop_empty_rows=True,\n",
" )\n",
" .drop(cs.by_index(0))\n",
" .select(cs.by_index(-2, -1))\n",
" .with_columns(pl.all().fill_null(strategy=\"forward\"))\n",
" .drop_nulls()\n",
" )\n",
" df.columns = [f\"{prefix}_code\", f\"{prefix}_title\"]\n",
" df = df.unique(f\"{prefix}_title\", keep=\"first\", maintain_order=True).sort(\n",
" f\"{prefix}_code\"\n",
" )\n",
" if prefix != \"division\":\n",
" df = df.with_columns(\n",
" pl.col(f\"{prefix}_code\").str.replace(r\".$\", \"\").alias(f\"{prefix}_parent\")\n",
" )\n",
" return df\n",
"\n",
"\n",
"anzsic06 = {\n",
" sheet_name: read_anzsic_sheet(sheet_name, prefix)\n",
" for sheet_name, prefix in zip(\n",
" [\"Divisions\", \"Groups\", \"Classes\"], [\"division\", \"group\", \"class\"]\n",
" )\n",
"}\n",
"\n",
"anzsic06"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create Subdivisions\n",
"anzsic06[\"Subdivisions\"] = pl.read_excel(\n",
" r\"data/1292.0.55.002_anzsic 2006 - codes and titles.xls\",\n",
" sheet_name=\"Subdivisions\",\n",
" drop_empty_rows=True,\n",
").drop(cs.by_index(0))\n",
"\n",
"anzsic06[\"Subdivisions\"] = (\n",
" anzsic06[\"Subdivisions\"]\n",
" .with_columns(pl.all().fill_null(strategy=\"forward\"))\n",
" .drop_nulls()\n",
")\n",
"\n",
"anzsic06[\"Subdivisions\"].columns = [\n",
" \"division_code\",\n",
" \"subdivision_code\",\n",
" \"subdivision_title\",\n",
"]\n",
"\n",
"anzsic06[\"Subdivisions\"] = (\n",
" anzsic06[\"Subdivisions\"]\n",
" .unique(\"subdivision_title\")\n",
" .sort([\"division_code\", \"subdivision_code\"])\n",
")\n",
"\n",
"anzsic06[\"Subdivisions\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"anzsic06_combined = (\n",
" anzsic06[\"Divisions\"]\n",
" .join(anzsic06[\"Subdivisions\"], on=\"division_code\", how=\"full\")\n",
" .join(\n",
" anzsic06[\"Groups\"],\n",
" left_on=\"subdivision_code\",\n",
" right_on=\"group_parent\",\n",
" how=\"full\",\n",
" )\n",
" .join(\n",
" anzsic06[\"Classes\"], left_on=\"group_code\", right_on=\"class_parent\", how=\"full\"\n",
" )\n",
" .drop(cs.ends_with(\"parent\"), cs.ends_with(\"_right\"))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"anzsic06_combined.write_csv(\"anzsic06.csv\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "anzsic",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
2 changes: 0 additions & 2 deletions tidy_anzsic.py

This file was deleted.

0 comments on commit bf384ae

Please sign in to comment.