Skip to content

Commit

Permalink
feat: data preprocessing and geocoding integration
Browse files Browse the repository at this point in the history
- Import necessary libraries for data processing and geocoding
- Load environment variables from .env file
- Preprocess sales data: convert object columns to float, parse SALE DATE
- Filter out rows where SALE PRICE, LAND SQUARE FEET, or GROSS SQUARE FEET is zero
- Integrate Google Maps Geocoding API for geocoding addresses
- Store geocoding results in a dictionary and update JSON file
- Map latitudes and longitudes to the dataframe using the geocoding results
- Save cleaned data to a new CSV file
  • Loading branch information
ASPPIBRA committed May 26, 2024
1 parent 1fe3017 commit 3cfc89e
Showing 1 changed file with 36 additions and 22 deletions.
58 changes: 36 additions & 22 deletions data_treatment.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import json
import os
import pandas as pd
import numpy as np
import requests
from dotenv import load_dotenv

# Carregar variáveis de ambiente do arquivo .env
load_dotenv()

# =================================================================
# Tratamento de dados de vendas
df_data = pd.read_csv("Dataset/nyc-rolling-sales.csv", index_col=0)

Expand All @@ -13,15 +16,17 @@
# Cast the columns listed in object_cols to float. The builtin ``float`` is
# used instead of ``np.float``: that alias was deprecated in NumPy 1.20 and
# removed in NumPy 1.24, where it raises AttributeError. Both name the same
# float64 dtype, so the result is unchanged on older NumPy versions.
df_data[object_cols] = df_data[object_cols].astype(float)
# Parse the SALE DATE column into pandas datetime values.
df_data["SALE DATE"] = pd.to_datetime(df_data["SALE DATE"])

df_data = df_data[df_data["SALE PRICE"] != 0]
df_data = df_data[df_data["LAND SQUARE FEET"] != 0]
df_data = df_data[df_data["GROSS SQUARE FEET"] != 0]
df_data = df_data[(df_data["SALE PRICE"] != 0) & (df_data["LAND SQUARE FEET"] != 0) & (df_data["GROSS SQUARE FEET"] != 0)]

# =================================================================
# Tratamento de dados LATITUDES e LONGITUDES

here_api = open("keys/here_api").read()
dict_address = json.load(open('dict_notes.json'))
# Chave da API do Google Maps
google_api_key = os.getenv("GOOGLE_API_KEY")

# Load the address -> coordinates dictionary from the JSON file. The context
# manager closes the file handle as soon as parsing finishes.
with open('dict_notes.json') as json_file:
    dict_address = json.load(json_file)

error = []
c = 0
total = len(df_data["ADDRESS"].unique())
Expand All @@ -30,34 +35,43 @@
try:
if address in dict_address.keys():
continue
URL = "https://geocode.search.hereapi.com/v1/geocode"
location = address + ", NYC"
PARAMS = {'apikey':here_api,'q':location}
r = requests.get(url = URL, params = PARAMS)

# Fazer solicitação HTTP para obter latitude e longitude
URL = "https://maps.googleapis.com/maps/api/geocode/json"
location = address + ", NYC"
PARAMS = {'address': location, 'key': google_api_key}
r = requests.get(url=URL, params=PARAMS)
data = r.json()

lat = data['items'][0]['position']['lat']
long = data['items'][0]['position']['lng']
dict_address[address] = {"latitude": lat, "longitude": long}
with open('dict_notes.json', 'w') as f:
json.dump(dict_address, f)

# Verificar se a resposta possui dados válidos
if 'results' in data and data['results']:
lat = data['results'][0]['geometry']['location']['lat']
lng = data['results'][0]['geometry']['location']['lng']
dict_address[address] = {"latitude": lat, "longitude": lng}
with open('dict_notes.json', 'w') as f:
json.dump(dict_address, f)
else:
error.append(address)
except Exception as e:
print(e)
error += [address]
print("Erro ao processar endereço:", address, "-", e)
error.append(address)

c += 1
print(c, total)

# ===================================
# Tratamento final

dict_address = json.load(open('dict_notes.json'))
# Reload the address dictionary from the JSON file so the final mapping uses
# what was persisted to disk. The context manager closes the file handle.
with open('dict_notes.json') as json_file:
    dict_address = json.load(json_file)

# LATITUDE AND LONGITUDE
# Build one address -> value lookup per coordinate from the address dictionary.
dict_lat = {addr: dict_address[addr]["latitude"] for addr in dict_address}
dict_long = {addr: dict_address[addr]["longitude"] for addr in dict_address}

# Attach the coordinates to every row through its ADDRESS value; addresses
# absent from the lookups end up as NaN (Series.map with a dict).
df_data["LATITUDE"] = df_data["ADDRESS"].map(dict_lat)
df_data["LONGITUDE"] = df_data["ADDRESS"].map(dict_long)

# Write the cleaned, geocoded dataset to a new CSV file.
df_data.to_csv("Dataset/cleaned_data.csv")

0 comments on commit 3cfc89e

Please sign in to comment.