Skip to content

Commit

Permalink
feat: add br_mdr_snis data wrangling code
Browse files Browse the repository at this point in the history
  • Loading branch information
folhesgabriel committed Jul 1, 2024
1 parent 4195236 commit 237588d
Show file tree
Hide file tree
Showing 2 changed files with 461 additions and 0 deletions.
360 changes: 360 additions & 0 deletions models/br_mdr_snis/code/snis.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,360 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"#from code.utils import *\n",
"import pandas as pd\n",
"from io import StringIO\n",
"import os\n",
"import requests\n",
"import requests\n",
"from bs4 import BeautifulSoup as soup\n",
"from typing import List\n",
"from typing import Dict\n",
"import numpy as np\n",
"import unicodedata\n",
"import basedosdados as bd\n"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from io import StringIO\n",
"import requests\n",
"import unicodedata\n",
"from typing import Dict\n",
"\n",
"\n",
"#---- mudar nome das colunas ----#\n",
"\n",
"def change_column_name(\n",
" url_architecture: str\n",
")-> Dict[str, str]:\n",
" \n",
" \"\"\"Essa função recebe como input uma string com link para uma tabela de arquitetura\n",
" e retorna um dicionário com os nomes das colunas originais e os nomes das colunas \n",
" padronizados\n",
" Returns:\n",
" dict: com chaves sendo os nomes originais e valores sendo os nomes padronizados\n",
" \"\"\"\n",
" # Converte a URL de edição para um link de exportação em formato csv\n",
" url = url_architecture.replace(\n",
" \"edit#gid=\",\n",
" \"export?format=csv&gid=\"\n",
" )\n",
" \n",
" Coloca a arquitetura em um dataframe\n",
" df_architecture = pd.read_csv(\n",
" StringIO(requests.get(url, timeout=10).content.decode(\"utf-8\"))\n",
" )\n",
" \n",
" # Cria um dicionário de nomes de colunas e tipos de dados a partir do dataframe df_architecture\n",
" column_name_dict = dict(\n",
" zip(df_architecture['original_name'],df_architecture['name'])\n",
" )\n",
" \n",
" # Retorna o dicionário \n",
"\n",
" return column_name_dict\n",
"\n",
"#---- mudar tipos de dados ----#\n",
"\n",
"def change_dtypes(\n",
" url_architecture: str\n",
")-> Dict[str, str]:\n",
" \n",
" \"\"\"Essa função recebe como input uma string com link para uma tabela de arquitetura\n",
" e retorna um dicionário com os nomes das colunas originais e os nomes das colunas \n",
" padronizados\n",
" Returns:\n",
" dict: com chaves sendo os nomes originais e valores sendo os nomes padronizados\n",
" \"\"\"\n",
" # Converte a URL de edição para um link de exportação em formato csv\n",
" url = url_architecture.replace(\n",
" \"edit#gid=\",\n",
" \"export?format=csv&gid=\"\n",
" )\n",
" \n",
" Coloca a arquitetura em um dataframe\n",
" df_architecture = pd.read_csv(\n",
" StringIO(requests.get(url, timeout=10).content.decode(\"utf-8\"))\n",
" )\n",
" \n",
"\n",
" # Cria um dicionário de nomes de colunas e tipos de dados a partir do dataframe df_architecture\n",
" column_name_dict = dict(\n",
" zip(df_architecture['original_name'],df_architecture['bigquery_type'])\n",
" )\n",
"\n",
" #O pandas não consegue ler ints que tenham NAs\n",
" #Para contornar isso e não adicionar 0. ao final de cada número,\n",
" #optei por converter todos os inteiros para string\n",
" \n",
" #loop para padronizar os tipos de dados e converter in para string\n",
" for key, value in column_name_dict.items():\n",
" if value == 'string':\n",
" column_name_dict[key] = str\n",
" elif value == 'int64':\n",
" column_name_dict[key] = str\n",
" elif value == 'float64':\n",
" column_name_dict[key] = float\n",
"\n",
" return column_name_dict\n",
"\n",
"\n",
"#---- remover acentos e caracteres especiais ----#\n",
"def remove_accents(\n",
" input_str\n",
"):\n",
" \"\"\"Essa função é aplicada com método apply em uma coluna de um dataframe para remover\n",
" acentos e caracteres especiais de uma string. Exemplo de uso\n",
"\n",
" df[x].apply(remove_accents)\n",
"\n",
" Args:\n",
" input_str (pd.Series): Uma coluna com strings\n",
"\n",
" Returns:\n",
" pd.Series : coluna com strings sema acentos e caracteres especiais\n",
" \"\"\" \n",
" nfkd_form = unicodedata.normalize('NFKD', input_str)\n",
" \n",
" return u\"\".join([c for c in nfkd_form if not unicodedata.combining(c)])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#municipio\n",
"dir_path = r'\\br_mdr_snis\\input\\municipio'\n",
"files = os.listdir(dir_path)\n",
"\n",
"lista_dfs = []\n",
"\n",
"print('baixando dicionario da tabela de arquitetura para renomear colunas')\n",
"dicionario = change_column_name(\n",
" url_architecture = 'https://docs.google.com/spreadsheets/d/1OA4mSAo99kY7vN4hf3-n22fAhbcXMPuyF1uxGBIo3UE/edit?gid=0#gid=0'\n",
")\n",
"\n",
"print('baixando tipos de dados da tabela de arquitetura para importar dados')\n",
"dicionario_dtypes = change_dtypes(\n",
" url_architecture='https://docs.google.com/spreadsheets/d/1OA4mSAo99kY7vN4hf3-n22fAhbcXMPuyF1uxGBIo3UE/edit?gid=0#gid=0'\n",
")\n",
"\n",
"print('baixando diretorios de municípios da BD')\n",
"municipio = bd.read_table(\n",
" dataset_id= 'br_bd_diretorios_brasil',\n",
" table_id= 'municipio',\n",
" billing_project_id= \"pisagab-staging\"\n",
" )\n",
"municipio_dict = dict(zip(municipio.id_municipio_6, municipio.id_municipio))\n",
"\n",
"\n",
"for file in files:\n",
" \n",
" print(f'lendo arquivo {file}')\n",
" df = pd.read_csv(\n",
" dir_path + '\\\\' + file,\n",
" sep=';',\n",
" encoding= 'latin-1',\n",
" decimal=',',\n",
" thousands='.',\n",
" dtype= dicionario_dtypes,\n",
" )\n",
"\n",
" print('renomeando colunas')\n",
" #rename\n",
" df.rename(\n",
" columns=dicionario, \n",
" inplace=True\n",
" )\n",
"\n",
" print('adicionando id_municipio 7 dígitos')\n",
" #add id_municipio 7 digits\n",
" df['id_municipio'] = df['id_municipio'].map(municipio_dict)\n",
" \n",
" #check if there is any null value\n",
" if df['id_municipio'].isna().sum() != 0:\n",
" raise ValueError('id_municipio com valores nulos')\n",
" \n",
" \n",
" print('dropando colunas')\n",
" #delete cols\n",
" df.drop(\n",
" columns=[\n",
" 'Código do IBGE', \n",
" 'Município',\n",
" 'Prestadores',\n",
" 'Serviços',\n",
" 'Natureza Jurídica',\n",
" ], \n",
" inplace=True)\n",
" \n",
" print('tirando . de ints por precaução')\n",
" \n",
" strings_to_int = df.select_dtypes(include=['object']).columns\n",
" \n",
" for col in strings_to_int:\n",
" df[col] = df[col].str.replace('.', '')\n",
" print(f'col {col} formatada, . retirados')\n",
" \n",
" \n",
" print('ordenando colunas')\n",
" #redorder\n",
" df = df[dicionario.values()]\n",
"\n",
" lista_dfs.append(df)\n",
"\n",
"df = pd.concat(lista_dfs)\n",
"\n",
"df.to_csv(\n",
" r'\\br_mdr_snis\\output\\municipio.csv',\n",
" index=False,\n",
" sep=',',\n",
" encoding='utf-8',\n",
" na_rep='',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#prestador\n",
"dir_path = r'\\br_mdr_snis\\input\\prestador'\n",
"files = os.listdir(dir_path)\n",
"\n",
"lista_dfs = []\n",
"\n",
"print('baixando dicionario da tabela de arquitetura para renomear colunas')\n",
"dicionario = change_column_name(\n",
" url_architecture = 'https://docs.google.com/spreadsheets/d/1spjI65YVI17mxtC1tmXaqMGk2pKAUPBJ5AqENKwB1Ys/edit?gid=0#gid=0'\n",
")\n",
"\n",
"print('baixando tipos de dados da tabela de arquitetura para importar dados')\n",
"dicionario_dtypes = change_dtypes(\n",
" url_architecture='https://docs.google.com/spreadsheets/d/1spjI65YVI17mxtC1tmXaqMGk2pKAUPBJ5AqENKwB1Ys/edit?gid=0#gid=0'\n",
")\n",
"\n",
"print('baixando diretorios de municípios da BD')\n",
"municipio = bd.read_table(\n",
" dataset_id= 'br_bd_diretorios_brasil',\n",
" table_id= 'municipio',\n",
" billing_project_id= \"pisagab-staging\"\n",
" )\n",
"\n",
"municipio_dict = dict(zip(municipio.id_municipio_6, municipio.id_municipio))\n",
"\n",
"\n",
"for file in files:\n",
" \n",
" print(f'lendo arquivo {file}')\n",
" df = pd.read_csv(\n",
" dir_path + '\\\\' + file,\n",
" sep=';',\n",
" encoding= 'latin-1',\n",
" decimal=',',\n",
" thousands='.',\n",
" dtype= dicionario_dtypes,\n",
" )\n",
"\n",
" print('renomeando colunas')\n",
" #rename\n",
" df.rename(\n",
" columns=dicionario, \n",
" inplace=True\n",
" )\n",
"\n",
" print('adicionando id_municipio 7 dígitos')\n",
" #add id_municipio 7 digits\n",
" df['id_municipio'] = df['id_municipio'].map(municipio_dict)\n",
" \n",
" #check if there is any null value\n",
" if df['id_municipio'].isna().sum() != 0:\n",
" raise ValueError('id_municipio com valores nulos')\n",
" \n",
" print('dropando colunas')\n",
" #delete cols\n",
" df.drop(\n",
" columns=[\n",
" 'Município',\n",
" ], \n",
" inplace=True)\n",
" \n",
" print('tirando . de ints por precaução')\n",
" strings_to_int = df.select_dtypes(include=['object']).columns\n",
" \n",
" for col in strings_to_int:\n",
" df[col] = df[col].str.replace('.', '')\n",
" print(f'col {col} formatada, . retirados')\n",
" \n",
" \n",
" print('removendo acentos')\n",
" latin_list = [\n",
" 'prestador',\n",
" 'natureza_juridica',\n",
" 'tipo_servico',\n",
" 'local_atendimento_agua',\n",
" 'local_atendimento_esgoto',\n",
" ]\n",
"\n",
" for col in latin_list:\n",
" df[col] = df[col].apply(remove_accents)\n",
"\n",
"\n",
" #insert id_natureza_juridica\n",
"\n",
" print('ordenando colunas')\n",
" #redorder\n",
" df = df[dicionario.values()]\n",
"\n",
" lista_dfs.append(df)\n",
"\n",
"df = pd.concat(lista_dfs)\n",
"\n",
"df.to_csv(\n",
" r'\\br_mdr_snis\\output\\prestador.csv',\n",
" index=False,\n",
" sep=',',\n",
" encoding='utf-8',\n",
" na_rep='',\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "vm_basedosdados",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading

0 comments on commit 237588d

Please sign in to comment.