Skip to content

Commit

Permalink
feat: add code for creating anzsic_1993.csv and upload
Browse files Browse the repository at this point in the history
  • Loading branch information
asiripanich committed Oct 24, 2024
1 parent 36da51d commit ee2e463
Show file tree
Hide file tree
Showing 2 changed files with 179 additions and 2 deletions.
8 changes: 7 additions & 1 deletion .github/workflows/tidy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,13 @@ jobs:
- name: Run notebook
run: papermill tidy.ipynb tidy-rendered.ipynb

- name: Upload CSV artifact
- name: Upload anzsic_1993.csv
uses: actions/upload-artifact@v4
with:
name: anzsic_1993.csv
path: anzsic_1993.csv

- name: Upload anzsic_2006.csv
uses: actions/upload-artifact@v4
with:
name: anzsic_2006.csv
Expand Down
173 changes: 172 additions & 1 deletion tidy.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,185 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"import polars as pl\n",
"import polars.selectors as cs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Create `anzsic_1993.csv`"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"# Read the correspondence sheet and number every row so that each row's\n",
"# position in the sheet is available as an `index` column.\n",
"anzsic93_raw = pl.read_excel(\n",
"    r\"data/1292.0.55.005_ anzsic 2006 - correspondence tables.xls\",\n",
"    sheet_name=\"Table 1\",\n",
"    drop_empty_rows=True,\n",
").with_row_index()\n",
"\n",
"# Discard the rows numbered 0-2 and every column beyond the first four.\n",
"anzsic93_raw = anzsic93_raw.filter(pl.col(\"index\").is_between(0, 2).not_()).drop(\n",
"    ~cs.by_index(0, 1, 2, 3)\n",
")\n",
"anzsic93_raw.columns = [\"index\", \"parent\", \"code\", \"title\"]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"def tidy_anzsic93(df, name, code_len):\n",
"    \"\"\"Extract the rows of one hierarchy level from the raw ANZSIC 1993 table.\n",
"\n",
"    Keeps the rows whose `parent` value is exactly `code_len` characters\n",
"    long, drops the last column, relabels the remaining three columns as\n",
"    index/code/title, sorts by index, and prefixes every column name with\n",
"    the lower-cased `name` minus its final character (\"Groups\" -> \"group_\").\n",
"\n",
"    Args:\n",
"        df: Frame with a string `parent` column; expected to have four columns.\n",
"        name: Level name whose last character is stripped to form the prefix.\n",
"        code_len: Character length of `parent` that identifies the level.\n",
"\n",
"    Returns:\n",
"        A frame with columns `<prefix>index`, `<prefix>code`, `<prefix>title`.\n",
"    \"\"\"\n",
"    prefix = name[:-1].lower() + \"_\"\n",
"    level_rows = df.filter(pl.col(\"parent\").str.len_chars() == code_len)\n",
"    level_rows = level_rows.drop(cs.by_index(-1))\n",
"    # After the drop, the level's code sits in the second column and its\n",
"    # title in the third, so relabel the columns positionally.\n",
"    level_rows.columns = [\"index\", \"code\", \"title\"]\n",
"    return level_rows.sort(\"index\").rename(lambda colname: prefix + colname)\n",
"\n",
"\n",
"# One tidy frame per level; the position in the list (1, 2, 3) is the\n",
"# `parent` length that tidy_anzsic93 uses to pick out that level.\n",
"anzsic93 = {\n",
"    level: tidy_anzsic93(anzsic93_raw, level, code_len)\n",
"    for code_len, level in enumerate(\n",
"        [\"Divisions\", \"Subdivisions\", \"Groups\"], start=1\n",
"    )\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (465, 8)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>division_code</th><th>division_title</th><th>subdivision_code</th><th>subdivision_title</th><th>group_code</th><th>group_title</th><th>class_code</th><th>class_title</th></tr><tr><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td><td>str</td></tr></thead><tbody><tr><td>&quot;A&quot;</td><td>&quot;Agriculture, Forestry and Fish…</td><td>&quot;01&quot;</td><td>&quot;Agriculture&quot;</td><td>&quot;011&quot;</td><td>&quot;Horticulture and Fruit Growing&quot;</td><td>&quot;0111&quot;</td><td>&quot;Plant Nurseries&quot;</td></tr><tr><td>&quot;A&quot;</td><td>&quot;Agriculture, Forestry and Fish…</td><td>&quot;01&quot;</td><td>&quot;Agriculture&quot;</td><td>&quot;011&quot;</td><td>&quot;Horticulture and Fruit Growing&quot;</td><td>&quot;0112&quot;</td><td>&quot;Cut Flower and Flower Seed Gro…</td></tr><tr><td>&quot;A&quot;</td><td>&quot;Agriculture, Forestry and Fish…</td><td>&quot;01&quot;</td><td>&quot;Agriculture&quot;</td><td>&quot;011&quot;</td><td>&quot;Horticulture and Fruit Growing&quot;</td><td>&quot;0113&quot;</td><td>&quot;Vegetable Growing&quot;</td></tr><tr><td>&quot;A&quot;</td><td>&quot;Agriculture, Forestry and Fish…</td><td>&quot;01&quot;</td><td>&quot;Agriculture&quot;</td><td>&quot;011&quot;</td><td>&quot;Horticulture and Fruit Growing&quot;</td><td>&quot;0114&quot;</td><td>&quot;Grape Growing&quot;</td></tr><tr><td>&quot;A&quot;</td><td>&quot;Agriculture, Forestry and Fish…</td><td>&quot;01&quot;</td><td>&quot;Agriculture&quot;</td><td>&quot;011&quot;</td><td>&quot;Horticulture and Fruit Growing&quot;</td><td>&quot;0115&quot;</td><td>&quot;Apple and Pear Growing&quot;</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>&quot;Q&quot;</td><td>&quot;Personal and Other Services&quot;</td><td>&quot;96&quot;</td><td>&quot;Other 
Services&quot;</td><td>&quot;963&quot;</td><td>&quot;Public Order and Safety Servic…</td><td>&quot;9631&quot;</td><td>&quot;Police Services&quot;</td></tr><tr><td>&quot;Q&quot;</td><td>&quot;Personal and Other Services&quot;</td><td>&quot;96&quot;</td><td>&quot;Other Services&quot;</td><td>&quot;963&quot;</td><td>&quot;Public Order and Safety Servic…</td><td>&quot;9632&quot;</td><td>&quot;Corrective Centres&quot;</td></tr><tr><td>&quot;Q&quot;</td><td>&quot;Personal and Other Services&quot;</td><td>&quot;96&quot;</td><td>&quot;Other Services&quot;</td><td>&quot;963&quot;</td><td>&quot;Public Order and Safety Servic…</td><td>&quot;9633&quot;</td><td>&quot;Fire Brigade Services&quot;</td></tr><tr><td>&quot;Q&quot;</td><td>&quot;Personal and Other Services&quot;</td><td>&quot;96&quot;</td><td>&quot;Other Services&quot;</td><td>&quot;963&quot;</td><td>&quot;Public Order and Safety Servic…</td><td>&quot;9634&quot;</td><td>&quot;Waste Disposal Services&quot;</td></tr><tr><td>&quot;Q&quot;</td><td>&quot;Personal and Other Services&quot;</td><td>&quot;97&quot;</td><td>&quot;Private Households Employing S…</td><td>&quot;970&quot;</td><td>&quot;Private Households Employing S…</td><td>&quot;9700&quot;</td><td>&quot;Private Households Employing S…</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (465, 8)\n",
"┌────────────┬────────────┬────────────┬───────────┬───────────┬───────────┬───────────┬───────────┐\n",
"│ division_c ┆ division_t ┆ subdivisio ┆ subdivisi ┆ group_cod ┆ group_tit ┆ class_cod ┆ class_tit │\n",
"│ ode ┆ itle ┆ n_code ┆ on_title ┆ e ┆ le ┆ e ┆ le │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ str ┆ str ┆ str ┆ str ┆ str ┆ str ┆ str ┆ str │\n",
"╞════════════╪════════════╪════════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡\n",
"│ A ┆ Agricultur ┆ 01 ┆ Agricultu ┆ 011 ┆ Horticult ┆ 0111 ┆ Plant │\n",
"│ ┆ e, ┆ ┆ re ┆ ┆ ure and ┆ ┆ Nurseries │\n",
"│ ┆ Forestry ┆ ┆ ┆ ┆ Fruit ┆ ┆ │\n",
"│ ┆ and Fish… ┆ ┆ ┆ ┆ Growing ┆ ┆ │\n",
"│ A ┆ Agricultur ┆ 01 ┆ Agricultu ┆ 011 ┆ Horticult ┆ 0112 ┆ Cut │\n",
"│ ┆ e, ┆ ┆ re ┆ ┆ ure and ┆ ┆ Flower │\n",
"│ ┆ Forestry ┆ ┆ ┆ ┆ Fruit ┆ ┆ and │\n",
"│ ┆ and Fish… ┆ ┆ ┆ ┆ Growing ┆ ┆ Flower │\n",
"│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ Seed Gro… │\n",
"│ A ┆ Agricultur ┆ 01 ┆ Agricultu ┆ 011 ┆ Horticult ┆ 0113 ┆ Vegetable │\n",
"│ ┆ e, ┆ ┆ re ┆ ┆ ure and ┆ ┆ Growing │\n",
"│ ┆ Forestry ┆ ┆ ┆ ┆ Fruit ┆ ┆ │\n",
"│ ┆ and Fish… ┆ ┆ ┆ ┆ Growing ┆ ┆ │\n",
"│ A ┆ Agricultur ┆ 01 ┆ Agricultu ┆ 011 ┆ Horticult ┆ 0114 ┆ Grape │\n",
"│ ┆ e, ┆ ┆ re ┆ ┆ ure and ┆ ┆ Growing │\n",
"│ ┆ Forestry ┆ ┆ ┆ ┆ Fruit ┆ ┆ │\n",
"│ ┆ and Fish… ┆ ┆ ┆ ┆ Growing ┆ ┆ │\n",
"│ A ┆ Agricultur ┆ 01 ┆ Agricultu ┆ 011 ┆ Horticult ┆ 0115 ┆ Apple and │\n",
"│ ┆ e, ┆ ┆ re ┆ ┆ ure and ┆ ┆ Pear │\n",
"│ ┆ Forestry ┆ ┆ ┆ ┆ Fruit ┆ ┆ Growing │\n",
"│ ┆ and Fish… ┆ ┆ ┆ ┆ Growing ┆ ┆ │\n",
"│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
"│ Q ┆ Personal ┆ 96 ┆ Other ┆ 963 ┆ Public ┆ 9631 ┆ Police │\n",
"│ ┆ and Other ┆ ┆ Services ┆ ┆ Order and ┆ ┆ Services │\n",
"│ ┆ Services ┆ ┆ ┆ ┆ Safety ┆ ┆ │\n",
"│ ┆ ┆ ┆ ┆ ┆ Servic… ┆ ┆ │\n",
"│ Q ┆ Personal ┆ 96 ┆ Other ┆ 963 ┆ Public ┆ 9632 ┆ Correctiv │\n",
"│ ┆ and Other ┆ ┆ Services ┆ ┆ Order and ┆ ┆ e Centres │\n",
"│ ┆ Services ┆ ┆ ┆ ┆ Safety ┆ ┆ │\n",
"│ ┆ ┆ ┆ ┆ ┆ Servic… ┆ ┆ │\n",
"│ Q ┆ Personal ┆ 96 ┆ Other ┆ 963 ┆ Public ┆ 9633 ┆ Fire │\n",
"│ ┆ and Other ┆ ┆ Services ┆ ┆ Order and ┆ ┆ Brigade │\n",
"│ ┆ Services ┆ ┆ ┆ ┆ Safety ┆ ┆ Services │\n",
"│ ┆ ┆ ┆ ┆ ┆ Servic… ┆ ┆ │\n",
"│ Q ┆ Personal ┆ 96 ┆ Other ┆ 963 ┆ Public ┆ 9634 ┆ Waste │\n",
"│ ┆ and Other ┆ ┆ Services ┆ ┆ Order and ┆ ┆ Disposal │\n",
"│ ┆ Services ┆ ┆ ┆ ┆ Safety ┆ ┆ Services │\n",
"│ ┆ ┆ ┆ ┆ ┆ Servic… ┆ ┆ │\n",
"│ Q ┆ Personal ┆ 97 ┆ Private ┆ 970 ┆ Private ┆ 9700 ┆ Private │\n",
"│ ┆ and Other ┆ ┆ Household ┆ ┆ Household ┆ ┆ Household │\n",
"│ ┆ Services ┆ ┆ s ┆ ┆ s ┆ ┆ s │\n",
"│ ┆ ┆ ┆ Employing ┆ ┆ Employing ┆ ┆ Employing │\n",
"│ ┆ ┆ ┆ S… ┆ ┆ S… ┆ ┆ S… │\n",
"└────────────┴────────────┴────────────┴───────────┴───────────┴───────────┴───────────┴───────────┘"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows that carry a title are treated as the class level; their columns get\n",
"# a `class_` prefix so they stay distinguishable after the joins below.\n",
"anzsic93_classes = (\n",
"    anzsic93_raw.filter(pl.col(\"title\").is_not_null())\n",
"    .drop(\"parent\")\n",
"    .rename(lambda col: \"class_\" + col)\n",
")\n",
"\n",
"# The relationship between levels is inferred from row order: each backward\n",
"# as-of join attaches the nearest preceding row of the next level up.\n",
"anzsic93_combined = (\n",
"    anzsic93_classes.join_asof(\n",
"        anzsic93[\"Groups\"],\n",
"        left_on=\"class_index\",\n",
"        right_on=\"group_index\",\n",
"        strategy=\"backward\",\n",
"    )\n",
"    .join_asof(\n",
"        anzsic93[\"Subdivisions\"],\n",
"        left_on=\"group_index\",\n",
"        right_on=\"subdivision_index\",\n",
"        strategy=\"backward\",\n",
"    )\n",
"    .join_asof(\n",
"        anzsic93[\"Divisions\"],\n",
"        left_on=\"subdivision_index\",\n",
"        right_on=\"division_index\",\n",
"        strategy=\"backward\",\n",
"    )\n",
"    # The row indexes were only needed as join keys.\n",
"    .drop(cs.ends_with(\"_index\"))\n",
"    # Order the columns from the broadest level to the narrowest.\n",
"    .select(\n",
"        cs.starts_with(\"division\"),\n",
"        cs.starts_with(\"subdivision\"),\n",
"        cs.starts_with(\"group\"),\n",
"        cs.starts_with(\"class\"),\n",
"    )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The file name must match the `path` of the \"Upload anzsic_1993.csv\" step\n",
"# in .github/workflows/tidy.yml, otherwise the artifact is never found.\n",
"anzsic93_combined.write_csv(r\"anzsic_1993.csv\", quote_style=\"always\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down

0 comments on commit ee2e463

Please sign in to comment.