Skip to content

Commit

Permalink
- Moved logger to its own file.
Browse files Browse the repository at this point in the history
- Changed it from middleware to a regular function that needs to
  be called by any route handlers that want to use it. It adds seven
  lines to those functions, but the logger code is much easier to deal
  with now.
- Added that code to almost every route called by the front end on
  the search and comparison pages.
- It finally logs useful information when errors occur
- Had to make return_err_with_trace async
- Moved redundant config stuff to backend/config
  • Loading branch information
Sigfried committed Nov 2, 2023
1 parent 9e5657c commit 01571e5
Show file tree
Hide file tree
Showing 13 changed files with 425 additions and 340 deletions.
153 changes: 153 additions & 0 deletions backend/api_logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
import datetime
import os
import re
import httpx
import time
from _socket import gethostname
from typing import Dict, List, Optional

import pytz
from starlette.requests import Request

from backend.config import get_schema_name
from backend.db.utils import get_db_connection, insert_from_dict, run_sql
from backend.utils import dump


class Api_logger:
    """Logs API calls to the public.api_runs table.

    Usage: a route handler calls `await start_rpt(...)` on entry, then
    `await finish(...)` on success or `await log_error(...)` on failure.
    `start_rpt` must be called first; the other methods rely on the state
    it creates (`self.rpt`, `self.start_time`).
    """
    def __init__(self):
        pass

    async def start_rpt(self, request: Request, params: Dict):
        """Insert an initial log record for this API call.

        :param request: the incoming request (used for URL, client location)
        :param params: route params to log. List values are summarized:
            codeset ids go to a dedicated column, lists longer than 20 are
            logged as their length only.
        :raises Exception: for a param value type we don't know how to log
        """
        self.start_time = time.time()
        rpt = {}
        url = request.url
        api_call = url.components[2][1:]  # strip leading /
        rpt['api_call'] = api_call

        eastern = pytz.timezone('US/Eastern')
        rpt['timestamp'] = datetime.datetime.now(eastern).isoformat()

        rpt['host'] = os.getenv('HOSTENV', gethostname())

        rpt['client'] = await client_location(request)

        rpt['schema'] = get_schema_name()

        rpt_params = {}
        for k, v in params.items():
            if isinstance(v, list):
                if len(v) > 20:
                    # change any params with len > 20 to just log the len
                    rpt_params[k + '_len'] = len(v)
                elif k in ('codeset_ids', 'id'):
                    # put codeset_ids in a separate column (is this going to be helpful?)
                    codeset_ids = v

                    # a single 'a|b|c' element means pipe-delimited ids
                    if len(v) == 1 and isinstance(codeset_ids[0], str):
                        codeset_ids = codeset_ids[0].split('|')

                    codeset_ids = [int(x) for x in codeset_ids]
                    rpt['codeset_ids'] = codeset_ids
                else:
                    rpt_params[k] = v
            else:
                raise Exception(f"don't know how to log {k}: {dump(v)}")

        # everything but codeset_ids just gets dumped into the rpt
        params_list = [f'{k}: {v}' for k, v in rpt_params.items()]

        rpt['params'] = '; '.join(params_list)
        self.rpt = rpt
        with get_db_connection() as con:
            insert_from_dict(con, 'public.api_runs', rpt, skip_if_already_exists=False)

    async def finish(self, rows: Optional[List] = None):
        """Mark the call successful, recording the result row count if given.

        :param rows: the result rows, or None. (An int row count is also
            accepted, in case a caller already has only the count.)
        """
        # Fixes two issues with the old `rows: List = []` version:
        # - mutable default argument
        # - f'{rows} rows' dumped the entire list into the result column
        #   instead of its length
        if rows:
            n = rows if isinstance(rows, int) else len(rows)
            self.rpt['result'] = f'{n} rows'
        else:
            self.rpt['result'] = 'Success'

        await self.complete_log_record()

    async def complete_log_record(self):
        """Update this call's log record with elapsed time and final result."""
        end_time = time.time()
        process_seconds = end_time - self.start_time
        self.rpt['process_seconds'] = process_seconds

        with get_db_connection() as con:
            run_sql(con, """
                UPDATE public.api_runs
                SET process_seconds = :process_seconds, result = :result
                WHERE timestamp = :timestamp""", self.rpt)
            # using timestamp as a primary key. not the best practice, I know, but with microsecond
            # granularity (e.g., 2023-10-31T13:32:23.934211), it seems like it should be safe

    async def log_error(self, e):
        """Mark the call failed, recording the exception text as the result."""
        self.rpt['result'] = f'Error: {e}'
        await self.complete_log_record()

async def client_location(request: Request) -> str:
    """Return a best-effort location string for the requesting client.

    Resolves the client IP (preferring X-Forwarded-For, because
    request.client.host gives a local 169.154 IP on Azure), then — if an
    ipstack key is configured and the request isn't local — looks up a rough
    geolocation. Returns '<ip>: <city>, <region>' on success, else just the IP.
    """
    # rpt['client'] = request.client.host -- this gives a local (169.154) IP on azure
    # chatgpt recommends:
    forwarded_for: Optional[str] = request.headers.get('X-Forwarded-For')
    if forwarded_for:
        # The header can contain multiple IP addresses, so take the first one
        ip = forwarded_for.split(',')[0]
    else:
        ip = request.client.host

    ip = re.sub(':.*', '', ip)  # drop any :port suffix

    ipstack_key = os.getenv('API_STACK_KEY', None)

    if ip != '127.0.0.1' and ipstack_key:
        # Sample ipstack response (http://api.ipstack.com/<ip>?access_key=<key>):
        # {
        #     "ip": "134.201.250.155",
        #     "type": "ipv4",
        #     "country_code": "US",
        #     "country_name": "United States",
        #     "region_code": "CA",
        #     "region_name": "California",
        #     "city": "San Fernando",
        #     "zip": "91344",
        #     ...
        # }
        loc_url = f"http://api.ipstack.com/{ip}?access_key={ipstack_key}"

        async with httpx.AsyncClient() as client:
            response = await client.get(loc_url)
            # BUG FIX: the old check `if response and response.json:` tested a
            # bound method, which is always truthy, and then indexed 'city'
            # directly, raising KeyError on ipstack error payloads. Check the
            # payload instead and fall back to the bare IP.
            loc_obj = response.json() if response.status_code == 200 else {}
            if loc_obj.get('city'):
                return f"{ip}: {loc_obj['city']}, {loc_obj.get('region_name')}"

    return ip
152 changes: 9 additions & 143 deletions backend/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,27 +3,18 @@
Resources
- https://github.com/tiangolo/fastapi
"""
import os
from typing import List, Optional
import re

import uvicorn
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.gzip import GZipMiddleware
import httpx
# from starlette.requests import Request
import time
import datetime
from socket import gethostname

import backend.config
from backend.routes import cset_crud, db, graph
from backend.db.config import override_schema, get_schema_name
from backend.db.utils import insert_from_dict, get_db_connection, run_sql
from backend.config import override_schema

# users on the same server
APP = FastAPI()
# APP = FastAPI()
APP = FastAPI(client_max_size=100_000_000) # trying this, but it shouldn't be necessary
APP.include_router(cset_crud.router)
# APP.include_router(oak.router)
APP.include_router(graph.router)
Expand All @@ -37,136 +28,15 @@
APP.add_middleware(GZipMiddleware, minimum_size=1000)


async def client_location(request: Request) -> str:
# rpt['client'] = request.client.host -- this gives a local (169.154) IP on azure
# chatgpt recommends:
forwarded_for: Optional[str] = request.headers.get('X-Forwarded-For')
if forwarded_for:
# The header can contain multiple IP addresses, so take the first one
ip = forwarded_for.split(',')[0]
else:
ip = request.client.host

ip = re.sub(':.*', '', ip)

ipstack_key = os.getenv('API_STACK_KEY', None)

if ip != '127.0.0.1' and ipstack_key:
"""
http://api.ipstack.com/134.201.250.155?access_key=7a6f9d6d72d68a1452b643eb58cd8ee7&format=1
{
"ip": "134.201.250.155",
"type": "ipv4",
"continent_code": "NA",
"continent_name": "North America",
"country_code": "US",
"country_name": "United States",
"region_code": "CA",
"region_name": "California",
"city": "San Fernando",
"zip": "91344",
"latitude": 34.293949127197266,
"longitude": -118.50763702392578,
"location": {
"geoname_id": 5391945,
"capital": "Washington D.C.",
"languages": [
{
"code": "en",
"name": "English",
"native": "English"
}
],
"country_flag": "https://assets.ipstack.com/flags/us.svg",
"country_flag_emoji": "🇺🇸",
"country_flag_emoji_unicode": "U+1F1FA U+1F1F8",
"calling_code": "1",
"is_eu": false
}
}
"""

loc_url = f"http://api.ipstack.com/{ip}?access_key={ipstack_key}"

async with httpx.AsyncClient() as client:
response = await client.get(loc_url)
if response and response.json:
loc_obj = response.json()
location = f"{ip}: {loc_obj['city']}, {loc_obj['region_name']}"
return location

return ip



@APP.middleware("http")
async def set_schema_globally_and_log_calls(request: Request, call_next):
"""
This is middleware and will be EXECUTED ON EVERY API CALL
Its purpose is to log TermHub usage to help us prioritize performance improvements
Also, if a schema is provided, it will be used to override CONFIG['schema']
"""

url = request.url
query_params = request.query_params # Extracting query params as a dict

codeset_ids = query_params.getlist("codeset_ids")
if not codeset_ids:
print(f"No codeset_ids provided, not sure what monitoring to do, if any for {url}")
return await call_next(request)
if len(codeset_ids) == 1 and type(codeset_ids[0]) == str:
codeset_ids = codeset_ids[0].split('|')
codeset_ids = [int(x) for x in codeset_ids]
async def set_schema_globally(request: Request, call_next):
print(request.url)

start_time = time.time()

rpt = {}
rpt['timestamp'] = datetime.datetime.now().isoformat()

rpt['host'] = os.getenv('HOSTENV', gethostname())

rpt['client'] = await client_location(request)

schema = query_params.get("schema")
schema = request.query_params.get("schema")
if schema:
override_schema(schema)

schema = get_schema_name()
rpt['schema'] = schema

api_call = url.components[2][1:] # string leading /
rpt['api_call'] = api_call


if api_call == 'concept-ids-by-codeset-id':
rpt['related_codeset_ids'] = len(codeset_ids)
else:
rpt['codeset_ids'] = codeset_ids

print(f"Request: {request.url} {request.method} {schema} {codeset_ids}")

con = get_db_connection()
insert_from_dict(con, 'public.api_runs', rpt, skip_if_already_exists=False)

try:
response = await call_next(request) # Proceed with the request
rpt['result'] = 'Success'
except Exception as e:
rpt['result'] = f'Error: {e}'

end_time = time.time()
process_seconds = end_time - start_time
rpt['process_seconds'] = process_seconds

run_sql(con, """
UPDATE public.api_runs
SET process_seconds = :process_seconds, result = :result
WHERE timestamp = :timestamp""", rpt)
# using timestamp as a primary key. not the best practice, I know, but with microsecond granularity
# (e.g., 2023-10-31T13:32:23.934211), it seems like it should be safe

response.headers["X-Process-Time"] = str(process_seconds)
response = await call_next(request)
return response


Expand All @@ -182,6 +52,7 @@ def read_root():
url_list = [{"path": route.path, "name": route.name} for route in APP.routes]
return url_list


# CACHE_FILE = "cache.pickle"
#
#
Expand Down Expand Up @@ -227,9 +98,4 @@ def read_root():


if __name__ == '__main__':
run()


def monitor_request(request: Request, codeset_ids: List[int]) -> None:

pass
run()
42 changes: 42 additions & 0 deletions backend/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,45 @@
OUTPUT_DIR = os.path.join(PROJECT_ROOT, 'output')
ENV_FILE = os.path.join(ENV_DIR, '.env')
load_dotenv(ENV_FILE)

# Project paths, anchored on this file's directory and PROJECT_ROOT.
# NOTE(review): DB_DIR is the directory containing this config module —
# the name suggests it was moved here from backend/db; confirm.
DB_DIR = os.path.dirname(os.path.realpath(__file__))
BACKEND_DIR = os.path.join(DB_DIR, '..')
DOCS_DIR = os.path.join(PROJECT_ROOT, 'docs')
TERMHUB_CSETS_PATH = os.path.join(PROJECT_ROOT, 'termhub-csets')
DATASETS_PATH = os.path.join(TERMHUB_CSETS_PATH, 'datasets', 'prepped_files')
OBJECTS_PATH = os.path.join(TERMHUB_CSETS_PATH, 'objects')
DDL_JINJA_PATH_PATTERN = os.path.join(DB_DIR, 'ddl-*.jinja.sql')

# Primary DB connection settings, sourced from environment variables
# (loaded above from ENV_FILE). Missing variables yield None values.
CONFIG = {
    'server': os.getenv('TERMHUB_DB_SERVER'),
    'driver': os.getenv('TERMHUB_DB_DRIVER'),
    'host': os.getenv('TERMHUB_DB_HOST'),
    'user': os.getenv('TERMHUB_DB_USER'),
    'db': os.getenv('TERMHUB_DB_DB'),
    'schema': os.getenv('TERMHUB_DB_SCHEMA'),
    'pass': os.getenv('TERMHUB_DB_PASS'),
    'port': os.getenv('TERMHUB_DB_PORT'),
    'personal_access_token': os.getenv('GH_LIMITED_PERSONAL_ACCESS_TOKEN')
}
# Same settings for a local development database, via the *_LOCAL variants
# of the same environment variables.
CONFIG_LOCAL = {
    'server': os.getenv('TERMHUB_DB_SERVER_LOCAL'),
    'driver': os.getenv('TERMHUB_DB_DRIVER_LOCAL'),
    'host': os.getenv('TERMHUB_DB_HOST_LOCAL'),
    'user': os.getenv('TERMHUB_DB_USER_LOCAL'),
    'db': os.getenv('TERMHUB_DB_DB_LOCAL'),
    'schema': os.getenv('TERMHUB_DB_SCHEMA_LOCAL'),
    'pass': os.getenv('TERMHUB_DB_PASS_LOCAL'),
    'port': os.getenv('TERMHUB_DB_PORT_LOCAL'),
    'personal_access_token': os.getenv('GH_LIMITED_PERSONAL_ACCESS_TOKEN')
}


def override_schema(schema: str):
    """Globally override CONFIG['schema'] with *schema*.

    BUG FIX: the assignment used to sit in the `else` branch, so the
    override only "happened" when the new schema already matched the
    current one (a no-op) and never when it actually differed. Now we
    log when the value changes and always assign.
    """
    if CONFIG['schema'] != schema:
        print(f'Overriding {CONFIG["schema"]} schema to {schema}')
    CONFIG['schema'] = schema


def get_schema_name():
    """Return the currently active DB schema name from CONFIG."""
    schema = CONFIG['schema']
    return schema
Loading

0 comments on commit 01571e5

Please sign in to comment.