Skip to content

Commit

Permalink
Data Tools - Added find-tables script for data mapping analysis (bcgo…
Browse files Browse the repository at this point in the history
…v#3048)

* Add find-tables data tool for mapping analysis

* Change env sample
  • Loading branch information
leodube-aot authored Oct 30, 2024
1 parent e833c9b commit 0fb2821
Show file tree
Hide file tree
Showing 11 changed files with 283 additions and 0 deletions.
15 changes: 15 additions & 0 deletions data-tool/find-tables/.env.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Colin Database
ORACLE_USER=
ORACLE_PASSWORD=
ORACLE_DB_NAME=
ORACLE_HOST=
ORACLE_PORT=

ORACLE_INSTANT_CLIENT_DIR=

# Run configurations
FILING_TYP_CD="NOALU" # The specific colin filing type you want to create the mapping for.
MAX_FILINGS=100 # The maximum number of filings to use to create the mapping.
VERBOSE=0 # If 1, will print the rows associated with each child table mapping
MAX_MAPPING_DEPTH=2 # The mapping recursion depth level

4 changes: 4 additions & 0 deletions data-tool/find-tables/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.env
.DS_Store
venv/
__pycache__/
14 changes: 14 additions & 0 deletions data-tool/find-tables/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Setup dependencies
setup: clean
test -f venv/bin/activate || python3 -m venv venv ;\
. venv/bin/activate ;\
pip install -Ur requirements.txt

# Clean up project
clean:
rm -rf venv +
find . -name '__pycache__' -exec rm -fr {} +

# Run helper
run:
. venv/bin/activate && python find_tables.py
43 changes: 43 additions & 0 deletions data-tool/find-tables/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Data Analysis - Find Tables Tool

Creates a db table mapping for a specific filing type code.

### Description

This script works by:
1. Finding a list of FILING entries by FILING_TYP_CD. This list is limited using the MAX_FILINGS variable. For these results we only care about the EVENT_IDS.
2. We use these event ids to create sort of a "root" node to build the mapping around. The EVENT table has many foreign keys referencing it from other tables (these can be thought of as child tables).
3. We populate the event child tables list by getting all referencing constraints.
4. For the child tables, we can then recursively populate their child tables, up to the MAX_MAPPING_DEPTH

Notes:
- The SQL query I use on step 4 isn't perfect. It makes the assumption that the fk column name and pk column name are the same. You'll see log entries that say certain columns cannot be found for a specific. For now, you can just check whether any of these tables might contain useful migration data manually.
```
// Example debug log entries:
// This one happens since CORP_NAME has START_EVENT_ID and END_EVENT_ID, but not EVENT_ID.
// The EVENT_ID debug logs aren't really an issue since it refers back to the root event table.
Could not find column EVENT_ID in table CORP_NAME.
// This means that PARTY_TYP_CD refers to another table, but none of the entries relating to
// the filing type have a value (ie. only null values), so we can ignore this fk.
Only NULL values for PARTY_TYP_CD in table FILING_USER.
```

### Usage

1. Create a new `.env` file from the sample file. Enter the `COLIN DEV` readonly connection details, and the mapping configs.
2. Run `make setup`
3. Run `make run`


### Screenshots
<img src="screenshots/Screenshot%201.png" alt="drawing" width="600"/>

---

<img src="screenshots/Screenshot%202.png" alt="drawing" width="900"/>

---

<img src="screenshots/Screenshot%203.png" alt="drawing" width="600"/>
22 changes: 22 additions & 0 deletions data-tool/find-tables/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
"""All of the configuration for the service is captured here."""
import os
from dotenv import find_dotenv, load_dotenv

# Load all the envars from a .env file located in the project root
load_dotenv(find_dotenv())

class Config():
"""Base configuration class"""
# Colin connection
ORACLE_USER = os.getenv('ORACLE_USER', '')
ORACLE_PASSWORD = os.getenv('ORACLE_PASSWORD', '')
ORACLE_DB_NAME = os.getenv('ORACLE_DB_NAME', '')
ORACLE_HOST = os.getenv('ORACLE_HOST', '')
ORACLE_PORT = os.getenv('ORACLE_PORT', '')
ORACLE_INSTANT_CLIENT_DIR = os.getenv('ORACLE_INSTANT_CLIENT_DIR', '')

# Mapping configs
FILING_TYP_CD = os.getenv('FILING_TYP_CD', 'NOALU')
MAX_FILINGS = int(os.getenv('MAX_FILINGS', 100))
VERBOSE = int(os.getenv('VERBOSE', 0))
MAX_MAPPING_DEPTH = int(os.getenv('MAX_MAPPING_DEPTH', 1))
45 changes: 45 additions & 0 deletions data-tool/find-tables/find_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""Get tables with non-null foreign keys for a given COLIN filing event"""

import cx_Oracle

from config import Config
from models import EventTable


# Setup Oracle
dsn = f"{Config.ORACLE_HOST}:{Config.ORACLE_PORT}/{Config.ORACLE_DB_NAME}"
cx_Oracle.init_oracle_client(
lib_dir=Config.ORACLE_INSTANT_CLIENT_DIR
) # this line may not be required for some
connection = cx_Oracle.connect(
user=Config.ORACLE_USER, password=Config.ORACLE_PASSWORD, dsn=dsn
)
cursor = connection.cursor()


# Check connection
cursor.execute(
"SELECT filing_typ_cd, full_desc FROM filing_type WHERE filing_typ_cd=:filing_typ_cd",
filing_typ_cd=Config.FILING_TYP_CD,
)
filing_typ_cd, full_desc = cursor.fetchone()


# Get filings with filing type code
cursor.execute(
"SELECT event_id FROM filing WHERE filing_typ_cd=:filing_typ_cd FETCH FIRST :limit ROWS ONLY",
filing_typ_cd=filing_typ_cd,
limit=Config.MAX_FILINGS,
)
filing_event_ids = [event_id[0] for event_id in cursor.fetchall()]


# Get foreign keys for EVENT table
events = EventTable(filing_type_code=filing_typ_cd, event_ids=filing_event_ids)
events.build_mapping(cursor)

print(f"\n\nTable mapping for {filing_typ_cd}: {full_desc}", end="\n\n")
events.print()

# Close cursor
cursor.close()
138 changes: 138 additions & 0 deletions data-tool/find-tables/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
from dataclasses import dataclass, field
from pprint import pformat
from textwrap import indent

from config import Config


@dataclass
class EventTable:
filing_type_code: str
event_ids: list
child_tables: list = None

def print(self):
"""Prints the mapping"""
print("EVENT_IDs used to build mapping:")
print(self.event_ids, end="\n\n")
print(f"MAPPING (depth {Config.MAX_MAPPING_DEPTH}):")
print(f"EVENT ({len(self.event_ids)} entries)")
for child_table in self.child_tables:
child_table.print()

def build_mapping(self, cursor):
"""Get all child tables that reference the event table via foreign keys"""
self.child_tables = []

cursor.execute(
"""
SELECT table_name, column_name
FROM all_cons_columns
WHERE constraint_name IN (
SELECT constraint_name
FROM all_constraints
WHERE r_constraint_name IN (
SELECT constraint_name
FROM all_constraints
WHERE table_name=:table_name
)
)
""",
table_name="EVENT",
)

# Only get tables that have fk values in filing_ids_with_typ_cd
for table_name, column_name in cursor.fetchall():
stmt = (
f"SELECT * FROM {table_name} WHERE {column_name} IN ("
+ ",".join(str(eid) for eid in self.event_ids)
+ ")"
)
cursor.execute(stmt)
columns = [col[0] for col in cursor.description]
cursor.rowfactory = lambda *args: dict(zip(columns, args))
results = cursor.fetchall()
if len(results) > 0:
child_table = ChildTable(
table_name=table_name,
fk_column_to_parent=column_name,
rows=results,
)
child_table.recursive_set_child_tables(cursor)
self.child_tables.append(child_table)


@dataclass
class ChildTable:
table_name: str
fk_column_to_parent: str
rows: list
num_rows: int = field(init=False)
depth: int = 1
child_tables: list = field(init=False)

def __post_init__(self):
self.num_rows = len(self.rows)
self.child_tables = []

def print(self):
"""Prints the mapping"""
pad = "\t" * self.depth
print(
f"{pad}{self.table_name} ({self.num_rows} entries) on {self.fk_column_to_parent}"
)
if Config.VERBOSE and self.table_name != "EVENT":
print(indent(pformat(self.rows, compact=True), pad))
for child_table in self.child_tables:
child_table.print()

def recursive_set_child_tables(self, cursor):
"""Get all tables that this table references via foreign keys"""
if self.depth >= Config.MAX_MAPPING_DEPTH:
return

cursor.execute(
"""
SELECT table_name, column_name
FROM all_cons_columns
WHERE constraint_name IN (
SELECT c_pk.constraint_name
FROM all_cons_columns a
JOIN all_constraints c ON a.owner=c.owner AND a.constraint_name=c.constraint_name
JOIN all_constraints c_pk ON c.r_owner=c_pk.owner AND c.r_constraint_name=c_pk.constraint_name
WHERE c.constraint_type='R' AND a.table_name=:table_name
)
""",
table_name=self.table_name,
)

# Get only foreign keys with ids in fk_ids array
for table_name, column_name in cursor.fetchall():
if column_name not in self.rows[0]:
print(
f"Could not find column {column_name} in table {self.table_name}."
)
continue

column_vals = set(r[column_name] for r in self.rows)
in_stmt = ",".join(f"'{cv}'" for cv in column_vals if cv is not None)
if not in_stmt:
print(f"Only NULL values for {column_name} in table {self.table_name}.")
continue

stmt = (
f"SELECT * FROM {table_name} WHERE {column_name} IN (" + in_stmt + ")"
)
cursor.execute(stmt)
columns = [col[0] for col in cursor.description]
cursor.rowfactory = lambda *args: dict(zip(columns, args))
results = cursor.fetchall()
if len(results) > 0:
child_table = ChildTable(
table_name=table_name,
fk_column_to_parent=column_name,
rows=results,
depth=self.depth + 1,
)
child_table.recursive_set_child_tables(cursor)
self.child_tables.append(child_table)
2 changes: 2 additions & 0 deletions data-tool/find-tables/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
cx_Oracle==8.3.0
python-dotenv==1.0.1
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 0fb2821

Please sign in to comment.