Skip to content

Commit

Permalink
Merge pull request #173 from georgetown-cset/120-permid-sector
Browse files Browse the repository at this point in the history
Add permid sectors
  • Loading branch information
jmelot authored Nov 27, 2023
2 parents b0c901f + 1bdef29 commit 87b54c8
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 10 deletions.
2 changes: 1 addition & 1 deletion web/gui-v2/src/static_data/data.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion web/gui-v2/src/static_data/overall_data.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions web/raw_data/sectors.jsonl

Large diffs are not rendered by default.

70 changes: 64 additions & 6 deletions web/scripts/retrieve_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
ORIG_NAMES_FI = os.path.join(RAW_DATA_DIR, "company_names.jsonl")
# Cache of links to Google Finance page
EXCHANGE_LINK_FI = os.path.join(RAW_DATA_DIR, "exchange_links.jsonl")
# Cache of PERMID sectors
SECTOR_FI = os.path.join(RAW_DATA_DIR, "sectors.jsonl")
# Download of https://docs.google.com/spreadsheets/d/1OpZGUG9y0onZfRfx9aVgRWiYSFWJ-TRsDQwYtcniCB0/edit#gid=468518268
# containing student-retrieved company descriptions to supplement crunchbase
SUPPLEMENTAL_DESCRIPTIONS = os.path.join(RAW_DATA_DIR, "supplemental_company_descriptions.csv")
Expand Down Expand Up @@ -105,6 +107,33 @@ def get_exchange_link(market_key: str) -> dict:
return {"market_key": market_key, "link": gf_link}


def get_permid_sector(permids: list) -> tuple:
    """
    Get first permid sector available in permids for a PARAT company
    :param permids: List of permids a company may have. None is treated like an empty list
    :return: Tuple of (economic sector, business sector) taken from the first permid that has a non-null value
        for both, or ("Unknown", "Unknown") if we couldn't find one
    :raises ValueError: If the PERMID_API_KEY environment variable is unset or empty
    """
    access_token = os.environ.get("PERMID_API_KEY")
    economic_sector_key = "Primary Economic Sector"
    business_sector_key = "Primary Business Sector"
    if not access_token:
        raise ValueError("Please specify your permid key using an environment variable called PERMID_API_KEY")
    for permid in permids or []:
        # Sometimes multiple permids are available and it's not obvious which to pick, so we pick the
        # first one that has a not-null value for both sectors
        resp = requests.get(
            f"https://permid.org/api/mdaas/getEntityById/{permid}",
            # Let requests build the query string so the token is always URL-encoded
            params={"access-token": access_token},
            # Without a timeout a stalled connection would hang the whole refresh indefinitely
            timeout=30,
        )
        if resp.status_code != 200:
            print(f"Unexpected status code {resp.status_code} for {permid}")
            # An error response may not carry a JSON body, so don't try to parse it; try the next permid
            continue
        metadata = resp.json()
        economic_sector = metadata.get(economic_sector_key)
        business_sector = metadata.get(business_sector_key)
        if economic_sector and business_sector:
            # NOTE(review): indexing with [0] assumes the API returns each sector as a list - confirm
            return economic_sector[0], business_sector[0]
        elif economic_sector:
            print(f"No business sector available for {permid}")
    return "Unknown", "Unknown"


def retrieve_raw(get_links: bool) -> None:
"""
Retrieve raw data from the ai_companies_visualization dataset in BQ
Expand Down Expand Up @@ -536,7 +565,7 @@ def clean_misc_fields(js: dict, refresh_images: bool, lowercase_to_orig_cname: d
js["aliases"] = clean_aliases(js.pop("aliases"), lowercase_to_orig_cname,
orig_company_name if orig_company_name != js["name"].lower() else None)
js["stage"] = js["stage"] if js["stage"] else "Unknown"
js["permid_links"] = format_links(js.pop("permid"), "https://permid.org/1-")
js["permid_links"] = format_links(js.get("permid"), "https://permid.org/1-")
js["parent_info"] = clean_parent(js.pop("parent"), lowercase_to_orig_cname)
js["agg_child_info"] = clean_children(js.pop("children"), lowercase_to_orig_cname)
js["unagg_child_info"] = clean_children(js.pop("non_agg_children"), lowercase_to_orig_cname)
Expand Down Expand Up @@ -594,8 +623,6 @@ def get_category_counts(js: dict) -> None:
:param js: A dict of data corresponding to an individual PARAT record
:return: None (mutates js)
"""
# Spoof sector https://github.com/georgetown-cset/parat/issues/120
js["sector"] = f"Sector{js['cset_id'] % 3}"
articles = {
# spoof highly cited articles https://github.com/georgetown-cset/parat/issues/135
"highly_cited": {
Expand Down Expand Up @@ -675,6 +702,33 @@ def get_category_counts(js: dict) -> None:
js.pop(redundant_count)


def add_sectors(rows: list, refresh: bool) -> None:
    """
    Adds "sector" and "business_sector" to each row, updating sectors from PERMID if needed. Removes the
    "permid" key from the row which is no longer needed after this function runs
    :param rows: List of rows of company metadata
    :param refresh: If true, will refresh sectors from PERMID and overwrite the local cache in SECTOR_FI;
        otherwise sectors are read from that cache
    :return: None (mutates rows)
    """
    if refresh:
        sectors = {}
        for row in rows:
            # Tolerate a missing or null permid list; such a row simply gets the "Unknown" sectors
            econ_sector, business_sector = get_permid_sector(row.pop("permid", None) or [])
            row["sector"] = econ_sector
            row["business_sector"] = business_sector
            sectors[row["cset_id"]] = {"economic": econ_sector, "business": business_sector}
        # The cache is only written once every row has been retrieved
        with open(SECTOR_FI, mode="w") as f:
            json.dump(sectors, f)
        return
    with open(SECTOR_FI) as f:
        sectors = json.load(f)
    for row in rows:
        # JSON object keys are always strings, so the cset_id has to be converted before the lookup
        cset_id = str(row["cset_id"])
        cached = sectors.get(cset_id)
        if cached is None:
            # A company missing from the cache should not abort the whole build
            print(f"No cached sector for cset_id {cset_id}; rerun with --refresh_sectors to retrieve it")
            cached = {}
        row["sector"] = cached.get("economic", "Unknown")
        row["business_sector"] = cached.get("business", "Unknown")
        row.pop("permid", None)


def clean_row(row: str, refresh_images: bool, lowercase_to_orig_cname: dict, market_key_to_link: dict) -> dict:
"""
Given a row from a jsonl, reformat its elements into the form needed by the PARAT javascript
Expand Down Expand Up @@ -702,11 +756,12 @@ def clean_link(link: str) -> str:
return link


def clean(refresh_images: bool) -> dict:
def clean(refresh_images: bool, refresh_sectors: bool) -> dict:
"""
Reads and cleans the raw data from the local cache
:param refresh_images: if true, will re-download all the company logos from crunchbase; don't call with true
unless necessary
:param refresh_sectors: if true, will re-query the PERMID api for sector information for each company
:return: Return company-like metadata for groups
"""
rows = []
Expand All @@ -723,6 +778,7 @@ def clean(refresh_images: bool) -> dict:
with open(RAW_DATA_FI) as f:
for row in f:
rows.append(clean_row(row, refresh_images, lowercase_to_orig_cname, market_key_to_link))
add_sectors(rows, refresh_sectors)
add_supplemental_descriptions(rows)
add_ranks(rows)
company_rows, group_data = [], {}
Expand Down Expand Up @@ -775,13 +831,15 @@ def update_overall_data(group_data: dict) -> None:
help="Re-download the images; if not specified will use local cache")
parser.add_argument("--refresh_market_links", action="store_true", default=False,
help="Re-retrieve the market links (takes ~1.5 hrs); if not specified will use local cache")
parser.add_argument("--refresh_sectors", action="store_true", default=False,
help="Retrieve sector information from PERMID API; requires API key available in "
"PERMID_API_KEY environment variable")
args = parser.parse_args()

if args.refresh_market_links and not args.refresh_raw:
print("You must specify --refresh_raw if you want to refresh the market links")
exit(0)

if args.refresh_raw:
retrieve_raw(args.refresh_market_links)
group_data = clean(args.refresh_images)
group_data = clean(args.refresh_images, args.refresh_sectors)
update_overall_data(group_data)
2 changes: 1 addition & 1 deletion web/tests/test_data/alphabet_output.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
{
"cset_id": 796,
"country": "United States",
"sector": "Sector1",
"website": "https://abc.xyz/",
"crunchbase": {
"text": "096694c6-bcd2-a975-b95c-fab77c81d915",
Expand Down Expand Up @@ -38,6 +37,7 @@
"continent": "North America",
"local_logo": "alphabet.png",
"aliases": "Alphabet Inc",
"permid": [5082534760, 5030853586, 5053732847, 5028044072, 5050702354],
"permid_links": [
{
"text": 5082534760,
Expand Down
2 changes: 1 addition & 1 deletion web/tests/test_data/hugging_face_output.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
{
"cset_id": 1425,
"country": "United States",
"sector": "Sector0",
"website": "https://huggingface.co/",
"crunchbase": {
"text": "b7947f18-b199-45ac-b7da-66f5c52fcfbc",
Expand All @@ -17,6 +16,7 @@
"continent": "North America",
"local_logo": "hugging_face.png",
"aliases": "Hugging Face Inc; Hugging Face, Inc",
"permid": [5063742076],
"permid_links": [
{
"text": 5063742076,
Expand Down

0 comments on commit 87b54c8

Please sign in to comment.