-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add br_mdr_snis data wrangling code
- Loading branch information
1 parent
4195236
commit 237588d
Showing
2 changed files
with
461 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,360 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#from code.utils import *\n", | ||
"import pandas as pd\n", | ||
"from io import StringIO\n", | ||
"import os\n", | ||
"import requests\n", | ||
"import requests\n", | ||
"from bs4 import BeautifulSoup as soup\n", | ||
"from typing import List\n", | ||
"from typing import Dict\n", | ||
"import numpy as np\n", | ||
"import unicodedata\n", | ||
"import basedosdados as bd\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 56, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"from io import StringIO\n", | ||
"import requests\n", | ||
"import unicodedata\n", | ||
"from typing import Dict\n", | ||
"\n", | ||
"\n", | ||
"#---- mudar nome das colunas ----#\n", | ||
"\n", | ||
"def change_column_name(\n", | ||
" url_architecture: str\n", | ||
")-> Dict[str, str]:\n", | ||
" \n", | ||
" \"\"\"Essa função recebe como input uma string com link para uma tabela de arquitetura\n", | ||
" e retorna um dicionário com os nomes das colunas originais e os nomes das colunas \n", | ||
" padronizados\n", | ||
" Returns:\n", | ||
" dict: com chaves sendo os nomes originais e valores sendo os nomes padronizados\n", | ||
" \"\"\"\n", | ||
" # Converte a URL de edição para um link de exportação em formato csv\n", | ||
" url = url_architecture.replace(\n", | ||
" \"edit#gid=\",\n", | ||
" \"export?format=csv&gid=\"\n", | ||
" )\n", | ||
" \n", | ||
" # Coloca a arquitetura em um dataframe\n", | ||
" df_architecture = pd.read_csv(\n", | ||
" StringIO(requests.get(url, timeout=10).content.decode(\"utf-8\"))\n", | ||
" )\n", | ||
" \n", | ||
" # Cria um dicionário de nomes originais e nomes padronizados das colunas a partir do dataframe df_architecture\n", | ||
" column_name_dict = dict(\n", | ||
" zip(df_architecture['original_name'],df_architecture['name'])\n", | ||
" )\n", | ||
" \n", | ||
" # Retorna o dicionário \n", | ||
"\n", | ||
" return column_name_dict\n", | ||
"\n", | ||
"#---- mudar tipos de dados ----#\n", | ||
"\n", | ||
"def change_dtypes(\n", | ||
" url_architecture: str\n", | ||
")-> Dict[str, str]:\n", | ||
" \n", | ||
" \"\"\"Essa função recebe como input uma string com link para uma tabela de arquitetura\n", | ||
" e retorna um dicionário com os nomes das colunas originais e os tipos de dados \n", | ||
" usados na leitura dos arquivos (string e int64 viram str; float64 vira float)\n", | ||
" Returns:\n", | ||
" dict: com chaves sendo os nomes originais e valores sendo os tipos de dados\n", | ||
" \"\"\"\n", | ||
" # Converte a URL de edição para um link de exportação em formato csv\n", | ||
" url = url_architecture.replace(\n", | ||
" \"edit#gid=\",\n", | ||
" \"export?format=csv&gid=\"\n", | ||
" )\n", | ||
" \n", | ||
" # Coloca a arquitetura em um dataframe\n", | ||
" df_architecture = pd.read_csv(\n", | ||
" StringIO(requests.get(url, timeout=10).content.decode(\"utf-8\"))\n", | ||
" )\n", | ||
" \n", | ||
"\n", | ||
" # Cria um dicionário de nomes de colunas e tipos de dados a partir do dataframe df_architecture\n", | ||
" column_name_dict = dict(\n", | ||
" zip(df_architecture['original_name'],df_architecture['bigquery_type'])\n", | ||
" )\n", | ||
"\n", | ||
" #O pandas não consegue ler ints que tenham NAs\n", | ||
" #Para contornar isso e não adicionar 0. ao final de cada número,\n", | ||
" #optei por converter todos os inteiros para string\n", | ||
" \n", | ||
" #loop para padronizar os tipos de dados e converter int para string\n", | ||
" for key, value in column_name_dict.items():\n", | ||
" if value == 'string':\n", | ||
" column_name_dict[key] = str\n", | ||
" elif value == 'int64':\n", | ||
" column_name_dict[key] = str\n", | ||
" elif value == 'float64':\n", | ||
" column_name_dict[key] = float\n", | ||
"\n", | ||
" return column_name_dict\n", | ||
"\n", | ||
"\n", | ||
"#---- remover acentos e caracteres especiais ----#\n", | ||
"def remove_accents(\n", | ||
" input_str\n", | ||
"):\n", | ||
" \"\"\"Essa função é aplicada com método apply em uma coluna de um dataframe para remover\n", | ||
" acentos e caracteres especiais de uma string. Exemplo de uso\n", | ||
"\n", | ||
" df[x].apply(remove_accents)\n", | ||
"\n", | ||
" Args:\n", | ||
" input_str (str): Uma string (um elemento da coluna)\n", | ||
"\n", | ||
" Returns:\n", | ||
" str : string sem acentos e caracteres especiais\n", | ||
" \"\"\" \n", | ||
" nfkd_form = unicodedata.normalize('NFKD', input_str)\n", | ||
" \n", | ||
" return u\"\".join([c for c in nfkd_form if not unicodedata.combining(c)])\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#municipio\n", | ||
"dir_path = r'\\br_mdr_snis\\input\\municipio'\n", | ||
"files = os.listdir(dir_path)\n", | ||
"\n", | ||
"lista_dfs = []\n", | ||
"\n", | ||
"print('baixando dicionario da tabela de arquitetura para renomear colunas')\n", | ||
"dicionario = change_column_name(\n", | ||
" url_architecture = 'https://docs.google.com/spreadsheets/d/1OA4mSAo99kY7vN4hf3-n22fAhbcXMPuyF1uxGBIo3UE/edit?gid=0#gid=0'\n", | ||
")\n", | ||
"\n", | ||
"print('baixando tipos de dados da tabela de arquitetura para importar dados')\n", | ||
"dicionario_dtypes = change_dtypes(\n", | ||
" url_architecture='https://docs.google.com/spreadsheets/d/1OA4mSAo99kY7vN4hf3-n22fAhbcXMPuyF1uxGBIo3UE/edit?gid=0#gid=0'\n", | ||
")\n", | ||
"\n", | ||
"print('baixando diretorios de municípios da BD')\n", | ||
"municipio = bd.read_table(\n", | ||
" dataset_id= 'br_bd_diretorios_brasil',\n", | ||
" table_id= 'municipio',\n", | ||
" billing_project_id= \"pisagab-staging\"\n", | ||
" )\n", | ||
"municipio_dict = dict(zip(municipio.id_municipio_6, municipio.id_municipio))\n", | ||
"\n", | ||
"\n", | ||
"for file in files:\n", | ||
" \n", | ||
" print(f'lendo arquivo {file}')\n", | ||
" df = pd.read_csv(\n", | ||
" dir_path + '\\\\' + file,\n", | ||
" sep=';',\n", | ||
" encoding= 'latin-1',\n", | ||
" decimal=',',\n", | ||
" thousands='.',\n", | ||
" dtype= dicionario_dtypes,\n", | ||
" )\n", | ||
"\n", | ||
" print('renomeando colunas')\n", | ||
" #rename\n", | ||
" df.rename(\n", | ||
" columns=dicionario, \n", | ||
" inplace=True\n", | ||
" )\n", | ||
"\n", | ||
" print('adicionando id_municipio 7 dígitos')\n", | ||
" #add id_municipio 7 digits\n", | ||
" df['id_municipio'] = df['id_municipio'].map(municipio_dict)\n", | ||
" \n", | ||
" #check if there is any null value\n", | ||
" if df['id_municipio'].isna().sum() != 0:\n", | ||
" raise ValueError('id_municipio com valores nulos')\n", | ||
" \n", | ||
" \n", | ||
" print('dropando colunas')\n", | ||
" #delete cols\n", | ||
" df.drop(\n", | ||
" columns=[\n", | ||
" 'Código do IBGE', \n", | ||
" 'Município',\n", | ||
" 'Prestadores',\n", | ||
" 'Serviços',\n", | ||
" 'Natureza Jurídica',\n", | ||
" ], \n", | ||
" inplace=True)\n", | ||
" \n", | ||
" print('tirando . de ints por precaução')\n", | ||
" \n", | ||
" strings_to_int = df.select_dtypes(include=['object']).columns\n", | ||
" \n", | ||
" for col in strings_to_int:\n", | ||
" df[col] = df[col].str.replace('.', '')\n", | ||
" print(f'col {col} formatada, . retirados')\n", | ||
" \n", | ||
" \n", | ||
" print('ordenando colunas')\n", | ||
" #reorder\n", | ||
" df = df[dicionario.values()]\n", | ||
"\n", | ||
" lista_dfs.append(df)\n", | ||
"\n", | ||
"df = pd.concat(lista_dfs)\n", | ||
"\n", | ||
"df.to_csv(\n", | ||
" r'\\br_mdr_snis\\output\\municipio.csv',\n", | ||
" index=False,\n", | ||
" sep=',',\n", | ||
" encoding='utf-8',\n", | ||
" na_rep='',\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#prestador\n", | ||
"dir_path = r'\\br_mdr_snis\\input\\prestador'\n", | ||
"files = os.listdir(dir_path)\n", | ||
"\n", | ||
"lista_dfs = []\n", | ||
"\n", | ||
"print('baixando dicionario da tabela de arquitetura para renomear colunas')\n", | ||
"dicionario = change_column_name(\n", | ||
" url_architecture = 'https://docs.google.com/spreadsheets/d/1spjI65YVI17mxtC1tmXaqMGk2pKAUPBJ5AqENKwB1Ys/edit?gid=0#gid=0'\n", | ||
")\n", | ||
"\n", | ||
"print('baixando tipos de dados da tabela de arquitetura para importar dados')\n", | ||
"dicionario_dtypes = change_dtypes(\n", | ||
" url_architecture='https://docs.google.com/spreadsheets/d/1spjI65YVI17mxtC1tmXaqMGk2pKAUPBJ5AqENKwB1Ys/edit?gid=0#gid=0'\n", | ||
")\n", | ||
"\n", | ||
"print('baixando diretorios de municípios da BD')\n", | ||
"municipio = bd.read_table(\n", | ||
" dataset_id= 'br_bd_diretorios_brasil',\n", | ||
" table_id= 'municipio',\n", | ||
" billing_project_id= \"pisagab-staging\"\n", | ||
" )\n", | ||
"\n", | ||
"municipio_dict = dict(zip(municipio.id_municipio_6, municipio.id_municipio))\n", | ||
"\n", | ||
"\n", | ||
"for file in files:\n", | ||
" \n", | ||
" print(f'lendo arquivo {file}')\n", | ||
" df = pd.read_csv(\n", | ||
" dir_path + '\\\\' + file,\n", | ||
" sep=';',\n", | ||
" encoding= 'latin-1',\n", | ||
" decimal=',',\n", | ||
" thousands='.',\n", | ||
" dtype= dicionario_dtypes,\n", | ||
" )\n", | ||
"\n", | ||
" print('renomeando colunas')\n", | ||
" #rename\n", | ||
" df.rename(\n", | ||
" columns=dicionario, \n", | ||
" inplace=True\n", | ||
" )\n", | ||
"\n", | ||
" print('adicionando id_municipio 7 dígitos')\n", | ||
" #add id_municipio 7 digits\n", | ||
" df['id_municipio'] = df['id_municipio'].map(municipio_dict)\n", | ||
" \n", | ||
" #check if there is any null value\n", | ||
" if df['id_municipio'].isna().sum() != 0:\n", | ||
" raise ValueError('id_municipio com valores nulos')\n", | ||
" \n", | ||
" print('dropando colunas')\n", | ||
" #delete cols\n", | ||
" df.drop(\n", | ||
" columns=[\n", | ||
" 'Município',\n", | ||
" ], \n", | ||
" inplace=True)\n", | ||
" \n", | ||
" print('tirando . de ints por precaução')\n", | ||
" strings_to_int = df.select_dtypes(include=['object']).columns\n", | ||
" \n", | ||
" for col in strings_to_int:\n", | ||
" df[col] = df[col].str.replace('.', '')\n", | ||
" print(f'col {col} formatada, . retirados')\n", | ||
" \n", | ||
" \n", | ||
" print('removendo acentos')\n", | ||
" latin_list = [\n", | ||
" 'prestador',\n", | ||
" 'natureza_juridica',\n", | ||
" 'tipo_servico',\n", | ||
" 'local_atendimento_agua',\n", | ||
" 'local_atendimento_esgoto',\n", | ||
" ]\n", | ||
"\n", | ||
" for col in latin_list:\n", | ||
" df[col] = df[col].apply(remove_accents)\n", | ||
"\n", | ||
"\n", | ||
" #insert id_natureza_juridica\n", | ||
"\n", | ||
" print('ordenando colunas')\n", | ||
" #reorder\n", | ||
" df = df[dicionario.values()]\n", | ||
"\n", | ||
" lista_dfs.append(df)\n", | ||
"\n", | ||
"df = pd.concat(lista_dfs)\n", | ||
"\n", | ||
"df.to_csv(\n", | ||
" r'\\br_mdr_snis\\output\\prestador.csv',\n", | ||
" index=False,\n", | ||
" sep=',',\n", | ||
" encoding='utf-8',\n", | ||
" na_rep='',\n", | ||
")" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "vm_basedosdados", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.12" | ||
}, | ||
"orig_nbformat": 4 | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.