
Commit

Fix column names, integrate src/data/companies.json generation into preprocessing script, update data
jmelot committed May 20, 2024
1 parent 787999a commit 665aaf6
Showing 7 changed files with 11,780 additions and 11,845 deletions.
2 changes: 1 addition & 1 deletion web/gui-v2/src/data/companies.json

Large diffs are not rendered by default.

26 changes: 0 additions & 26 deletions web/gui-v2/src/scripts/filter_companies.py

This file was deleted.

2 changes: 1 addition & 1 deletion web/gui-v2/src/static_data/data.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion web/gui-v2/src/static_data/overall_data.json

Large diffs are not rendered by default.

23,575 changes: 11,763 additions & 11,812 deletions web/raw_data/exchange_links.jsonl

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion web/raw_data/sectors.jsonl

Large diffs are not rendered by default.

16 changes: 13 additions & 3 deletions web/scripts/retrieve_data.py
@@ -145,19 +145,29 @@ def retrieve_raw(get_links: bool) -> None:
    client = bigquery.Client()
    market_info = set()
    print("retrieving metadata")
    lower_name_to_id = {}
    with open(RAW_DATA_FI, mode="w") as out:
        for row in client.list_rows("ai_companies_visualization.all_visualization_data"):
            dict_row = {col: row[col] for col in row.keys()}
            if not row["name"]:
                print(f"{row['cset_id']} missing name")
                continue
            out.write(json.dumps(dict_row)+"\n")
            market_info = market_info.union([m["exchange"]+":"+m["ticker"] for m in dict_row["market"]])
            lower_name_to_id[dict_row["name"].lower()] = dict_row["cset_id"]
    print("retrieving original company names")
    id_and_orig_name = []
    with open(ORIG_NAMES_FI, mode="w") as out:
        for row in client.list_rows("ai_companies_visualization.original_company_names"):
            name = row["name"]
            if name is None:
                continue
            row = {"orig_name": name, "lowercase_name": name.lower()}
            out.write(json.dumps(row)+"\n")
            if name.lower() in lower_name_to_id:
                id_and_orig_name.append({"cset_id": lower_name_to_id[name.lower()], "name": name})
    with open(os.path.join(WEB_SRC_DIR, "data", "companies.json"), mode="w") as web_out:
        web_out.write(json.dumps(id_and_orig_name))
    if get_links:
        print("retrieving market links")
        with open(EXCHANGE_LINK_FI, mode="w") as out:
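
For context, a minimal, self-contained sketch of the name-to-ID join this hunk adds to retrieve_raw. The sample rows and the printed companies.json shape are illustrative assumptions, not data from the real ai_companies_visualization tables:

    # Illustrative sketch only: fake rows standing in for the two BigQuery tables.
    import json

    visualization_rows = [
        {"cset_id": 1, "name": "Acme AI"},         # hypothetical sample row
        {"cset_id": 2, "name": "Globex Robotics"}  # hypothetical sample row
    ]
    original_name_rows = [
        {"name": "ACME AI"},
        {"name": "Initech"}  # no match in the visualization table, so it is dropped
    ]

    # Same join logic as the hunk above: lowercase names key back to cset_ids.
    lower_name_to_id = {row["name"].lower(): row["cset_id"] for row in visualization_rows}
    id_and_orig_name = [
        {"cset_id": lower_name_to_id[row["name"].lower()], "name": row["name"]}
        for row in original_name_rows
        if row["name"] and row["name"].lower() in lower_name_to_id
    ]

    print(json.dumps(id_and_orig_name))
    # -> [{"cset_id": 1, "name": "ACME AI"}]

Matching on lowercased names means an original name is kept only if the visualization table contains the same company name in some casing; unmatched names are simply omitted from companies.json.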
@@ -648,10 +658,10 @@ def get_category_counts(js: dict) -> None:
    articles = {}
    ### Reformat publication-related metrics
    for machine_name, orig_key, count_key, is_top_research in [
-       ["all_publications", "all_pubs_by_year", "all_pubs", False],
+       ["all_publications", "all_pubs_by_year", "year_count", False],
        ["ai_publications", "ai_pubs_by_year", "ai_pubs", False],
-       ["highly_cited_ai_pubs", "highly_cited_ai_pubs_by_year", "highly_cited_ai_pubs", False],
-       ["ai_pubs_top_conf", "ai_pubs_in_top_conferences_by_year", "ai_pubs_in_top_conferences", False],
+       ["highly_cited_ai_pubs", "highly_cited_ai_pubs_by_year", "year_count", False],
+       ["ai_pubs_top_conf", "ai_pubs_in_top_conferences_by_year", "year_count", False],
        ["ai_citation_counts", "ai_citation_count_by_year", "ai_citation_count", False],
        ["cv_citation_counts", "cv_citation_count_by_year", "cv_citation_count", False],
        ["nlp_citation_counts", "nlp_citation_count_by_year", "nlp_citation_count", False],
