From ee6436e68d8aad81e1598fdb42d6638abeaa2947 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Thu, 28 Sep 2023 12:16:53 -0400 Subject: [PATCH 01/17] Create initial segment of DAG, through python code --- company_linkage/Dockerfile | 22 ++ company_linkage/README.md | 72 +++-- company_linkage/parat_data_dag.py | 294 ++++++++++++++++++ .../aggregate_organizations.py | 23 +- .../{ => parat_scripts}/all_papers.py | 0 .../deduplicate_companies.py | 0 .../{ => parat_scripts}/get_ai_counts.py | 0 .../test_aggregate_organizations.py | 12 +- .../{ => parat_scripts}/test_ai_counts.py | 2 +- .../{ => parat_scripts}/top_papers.py | 2 +- company_linkage/push_to_airflow.sh | 12 +- company_linkage/requirements.txt | 56 ++++ ...ema.json => aggregated_organizations.json} | 0 company_linkage/sequences.txt | 0 company_linkage/sequences/initial_data.csv | 6 + ...i_publications.sql => ai_publications.sql} | 2 - ..._publications.sql => all_publications.sql} | 0 ...g_ai_patents.sql => linked_ai_patents.sql} | 1 - ...airtable_imports.sql => organizations.sql} | 2 - ...rences.sql => pubs_in_top_conferences.sql} | 2 - ...rence_pubs.sql => top_conference_pubs.sql} | 2 - 21 files changed, 450 insertions(+), 60 deletions(-) create mode 100644 company_linkage/Dockerfile create mode 100644 company_linkage/parat_data_dag.py rename company_linkage/{ => parat_scripts}/aggregate_organizations.py (98%) rename company_linkage/{ => parat_scripts}/all_papers.py (100%) rename company_linkage/{ => parat_scripts}/deduplicate_companies.py (100%) rename company_linkage/{ => parat_scripts}/get_ai_counts.py (100%) rename company_linkage/{ => parat_scripts}/test_aggregate_organizations.py (97%) rename company_linkage/{ => parat_scripts}/test_ai_counts.py (98%) rename company_linkage/{ => parat_scripts}/top_papers.py (94%) mode change 100644 => 100755 company_linkage/push_to_airflow.sh create mode 100644 company_linkage/requirements.txt rename company_linkage/schemas/{aggregated_organizations_schema.json => 
aggregated_organizations.json} (100%) delete mode 100644 company_linkage/sequences.txt create mode 100644 company_linkage/sequences/initial_data.csv rename company_linkage/sql/{selecting_ai_publications.sql => ai_publications.sql} (95%) rename company_linkage/sql/{selecting_all_publications.sql => all_publications.sql} (100%) rename company_linkage/sql/{selecting_ai_patents.sql => linked_ai_patents.sql} (98%) rename company_linkage/sql/{create_organizations_from_airtable_imports.sql => organizations.sql} (97%) rename company_linkage/sql/{pulling_publications_in_top_ai_conferences.sql => pubs_in_top_conferences.sql} (93%) rename company_linkage/sql/{selecting_top_conference_pubs.sql => top_conference_pubs.sql} (97%) diff --git a/company_linkage/Dockerfile b/company_linkage/Dockerfile new file mode 100644 index 00000000..fc5ed907 --- /dev/null +++ b/company_linkage/Dockerfile @@ -0,0 +1,22 @@ +FROM ubuntu:20.04 + +# Set up system dependencies +RUN apt -y update +RUN apt-get -y update +RUN apt-get install -y build-essential libssl-dev libffi-dev python3-dev python3-pip curl + +# Grab files we need to run +ADD requirements.txt /parat/requirements.txt +ADD parat_scripts/* /parat/ + +# install gsutil and put it on the path for airflow to use +ENV CLOUDSDK_INSTALL_DIR /usr/local/gcloud/ +RUN curl -sSL https://sdk.cloud.google.com | bash +ENV PATH $PATH:/usr/local/gcloud/google-cloud-sdk/bin + +# Install python dependencies +WORKDIR /parat +ENV AIRFLOW_GPL_UNIDECODE=yes +RUN pip3 install -r requirements.txt +# Make sure the above config succeeded +RUN python3 -m pytest test_aggregate_organizations.py -k test_add_location \ No newline at end of file diff --git a/company_linkage/README.md b/company_linkage/README.md index 9dcdb1b3..47438633 100644 --- a/company_linkage/README.md +++ b/company_linkage/README.md @@ -16,37 +16,47 @@ run some of this code as-is. ## Tasks to build visualization data -1. 
[creating_organizations_from_airtable_imports.sql](sql/create_organizations_from_airtable_imports.sql) -2. [selecting_ai_publications.sql](sql/selecting_ai_publications.sql) -3. `python3 aggregate_organizations.py aggregated_organizations.jsonl` -4. Replace `high_resolution_entities.aggregated_organizations` with the data from `aggregated_organizations.jsonl` using the [aggregated_organizations_schema](schemas/aggregated_organizations_schema.json) -5. [selecting_ai_patents.sql](sql/selecting_ai_patents.sql) -6. `python3 get_ai_counts.py data/ai_company_papers.jsonl data/ai_company_patents.jsonl` -7. Upload `ai_company_papers.jsonl` to `ai_companies_visualization.ai_company_pubs` using the [ai_papers_schema](schemas/ai_papers_schema.json) -8. Upload `ai_company_patents.jsonl` to `ai_companies_visualization.ai_company_patents` using the [ai_patents_schema](schemas/ai_patents_schema.json) -9. [creating_initial_visualization_data_publications.sql](sql/creating_initial_visualization_data_publications.sql) -10. [adding_ai_pubs_by_year_to_visualization.sql](sql/adding_ai_pubs_by_year_to_visualization.sql) -11. [creating_patent_visualization_data.sql](sql/creating_patent_visualization_data.sql) -12. [adding_ai_patents_by_year_to_visualization.sql](sql/adding_ai_patents_by_year_to_visualization.sql) -13. [creating_paper_visualization_data.sql](sql/creating_paper_visualization_data.sql) -14. [adding_top_mag_ai_fields.sql](sql/adding_top_mag_ai_fields.sql) -15. [adding_top_science_map_clusters.sql](sql/adding_top_science_map_clusters.sql) -16. [adding_company_references.sql](sql/adding_company_references.sql) -17. [adding_top_tasks.sql](sql/adding_top_tasks.sql) -18. [adding_top_methods.sql](sql/adding_top_methods.sql) -19. [selecting_top_conference_pubs.sql](sql/selecting_top_conference_pubs.sql) -20. [pulling_publications_in_top_ai_conferences.sql](sql/pulling_publications_in_top_ai_conferences.sql) -21. `python3 top_papers.py top_paper_counts.jsonl` -22. 
Upload `top_paper_counts.jsonl` to `ai_companies_visualization.top_paper_counts` using the [top_papers_schema](schemas/top_papers_schema.json) -23. [adding_top_paper_counts.sql](sql/adding_top_paper_counts.sql) -24. [selecting_all_publications.sql](sql/selecting_all_publications.sql) -25. `python3 all_papers.py all_paper_counts.jsonl` -26. Upload `all_paper_counts.jsonl` to `ai_companies_visualization.total_paper_counts` using the [all_papers_schema](schemas/all_papers_schema.json) +1. [organizations.sql](sql/organizations.sql) +2. [ai_publications.sql](sql/ai_publications.sql) +3. [linked_ai_patents.sql](sql/linked_ai_patents.sql) +4. [top_conference_pubs.sql](sql/top_conference_pubs.sql) +5. [pubs_in_top_conferences.sql](sql/pubs_in_top_conferences.sql) +6. [all_publications.sql](sql/all_publications.sql) +7. `python3 aggregate_organizations.py aggregated_organizations.jsonl` +8. Replace `high_resolution_entities.aggregated_organizations` with the data from `aggregated_organizations.jsonl` using the [aggregated_organizations_schema](schemas/aggregated_organizations_schema.json) +9. `python3 get_ai_counts.py data/ai_company_papers.jsonl data/ai_company_patents.jsonl` +10. Upload `ai_company_papers.jsonl` to `ai_companies_visualization.ai_company_pubs` using the [ai_papers_schema](schemas/ai_papers_schema.json) +11. Upload `ai_company_patents.jsonl` to `ai_companies_visualization.ai_company_patents` using the [ai_patents_schema](schemas/ai_patents_schema.json) +12. `python3 top_papers.py top_paper_counts.jsonl` +13. Upload `top_paper_counts.jsonl` to `ai_companies_visualization.top_paper_counts` using the [top_papers_schema](schemas/top_papers_schema.json) +14. `python3 all_papers.py all_paper_counts.jsonl` +15. Upload `all_paper_counts.jsonl` to `ai_companies_visualization.total_paper_counts` using the [all_papers_schema](schemas/all_papers_schema.json) +16. 
[creating_initial_visualization_data_publications.sql](sql/creating_initial_visualization_data_publications.sql) +17. [adding_ai_pubs_by_year_to_visualization.sql](sql/adding_ai_pubs_by_year_to_visualization.sql) +18. [creating_patent_visualization_data.sql](sql/creating_patent_visualization_data.sql) +19. [adding_ai_patents_by_year_to_visualization.sql](sql/adding_ai_patents_by_year_to_visualization.sql) +20. [creating_paper_visualization_data.sql](sql/creating_paper_visualization_data.sql) +21. [adding_top_mag_ai_fields.sql](sql/adding_top_mag_ai_fields.sql) +22. [adding_top_science_map_clusters.sql](sql/adding_top_science_map_clusters.sql) +23. [adding_company_references.sql](sql/adding_company_references.sql) +24. [adding_top_tasks.sql](sql/adding_top_tasks.sql) +25. [adding_top_methods.sql](sql/adding_top_methods.sql) +26. [adding_top_paper_counts.sql](sql/adding_top_paper_counts.sql) 27. [adding_all_paper_counts.sql](sql/adding_all_paper_counts.sql) 28. [creating_workforce_visualization_data.sql](sql/creating_workforce_visualization_data.sql) 29. [adding_ai_jobs_to_workforce_visualization.sql](sql/adding_ai_jobs_to_workforce_visualization.sql) -31. [omit_by_rule.sql](sql/omit_by_rule.sql) -32. [omit_by_rule_papers.sql](sql/omit_by_rule_papers.sql) -33. [omit_by_rule_patents.sql](sql/omit_by_rule_patents.sql) -34. [omit_by_rule_workforce.sql](sql/omit_by_rule_workforce.sql) -35. [adding_crunchbase_company_metadata.sql](sql/adding_crunchbase_company_metadata.sql) \ No newline at end of file +30. [omit_by_rule.sql](sql/omit_by_rule.sql) +31. [omit_by_rule_papers.sql](sql/omit_by_rule_papers.sql) +32. [omit_by_rule_patents.sql](sql/omit_by_rule_patents.sql) +33. [omit_by_rule_workforce.sql](sql/omit_by_rule_workforce.sql) +34. 
[adding_crunchbase_company_metadata.sql](sql/adding_crunchbase_company_metadata.sql) + +# Deployment + +To refresh the docker container (which you must do if you change any of the python scripts in parat_scripts/), run + +``` +docker build -t parat . +docker tag parat us.gcr.io/gcp-cset-projects/parat +docker push us.gcr.io/gcp-cset-projects/parat +``` \ No newline at end of file diff --git a/company_linkage/parat_data_dag.py b/company_linkage/parat_data_dag.py new file mode 100644 index 00000000..e64c2fca --- /dev/null +++ b/company_linkage/parat_data_dag.py @@ -0,0 +1,294 @@ +import os +from datetime import datetime + +from airflow import DAG +from airflow.operators.python import PythonOperator +from airflow.operators.trigger_dagrun import TriggerDagRunOperator +from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator +from airflow.providers.google.cloud.operators.cloud_sql import ( + CloudSQLImportInstanceOperator, +) +from airflow.providers.google.cloud.operators.kubernetes_engine import GKEStartPodOperator +from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator +from airflow.operators.dummy import DummyOperator +from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator +from airflow.providers.google.cloud.transfers.bigquery_to_gcs import ( + BigQueryToGCSOperator, +) +from dataloader.airflow_utils.defaults import ( + DATA_BUCKET, + PROJECT_ID, + GCP_ZONE, + get_default_args, + get_post_success, +) +from dataloader.scripts.populate_documentation import update_table_descriptions +from parat_scripts.aggregate_organizations import aggregate_organizations + +bucket = DATA_BUCKET +initial_dataset = "parat_input" +intermediate_dataset = "high_resolution_entities" +production_dataset = "ai_companies_visualization" +staging_intermediate_dataset = f"staging_{intermediate_dataset}" +staging_dataset = f"staging_{production_dataset}" +sql_dir = "sql/parat" +schema_dir = 
"schemas/parat" +tmp_dir = f"{production_dataset}/tmp" + +default_args = get_default_args() +date = datetime.now().strftime("%Y%m%d") + + +# Part 2: Get data from airtable and update databases +dag = DAG( + "parat", + default_args=default_args, + description="PARAT data updater", + schedule_interval=None, + catchup=False, + user_defined_macros={ + "staging_dataset": staging_dataset, + "production_dataset": production_dataset, + "staging_intermediate_dataset": staging_intermediate_dataset, + "intermediate_dataset": intermediate_dataset, + "initial_dataset": initial_dataset + }, +) +with dag: + + clear_tmp_dir = GCSDeleteObjectsOperator( + task_id="clear_tmp_dir", + bucket_name=DATA_BUCKET, + prefix=tmp_dir + ) + + # combine all the airtable tables into joined tables + + start = DummyOperator(task_id="starting") + + join_tables = [] + for table in ["alias", "grid", "ids", "linkedin", "market", "organizations", "parent", "permid"]: + + # Grab all the data and write it to unseen_en_corpus + join_table = BigQueryInsertJobOperator( + task_id=f"join_{table}", + configuration={ + "query": { + "query": f"select distinct * from {initial_dataset}.{table}_preannotation UNION DISTINCT " + f"select distinct * from {initial_dataset}.{table}_validate", + "useLegacySql": False, + "destinationTable": { + "projectId": PROJECT_ID, + "datasetId": initial_dataset, + "tableId": f"{table}_joined" + }, + "allowLargeResults": True, + "createDisposition": "CREATE_IF_NEEDED", + "writeDisposition": "WRITE_TRUNCATE" + } + } + ) + join_tables.append(join_table) + + # Do initial query sequence + + start_initial_tables = DummyOperator(task_id="start_initial_tables") + + wait_for_initial_tables = DummyOperator(task_id="wait_for_initial_tables") + + seq_path_prefix = f"{os.environ.get('DAGS_FOLDER')}/sequences/parat/" + initial_query_sequence = "initial_data.csv" + + curr = start_initial_tables + for line in open(seq_path_prefix + initial_query_sequence).readlines(): + dataset, table = 
line.split(",") + staging_table_name = f"staging_{dataset}.{table.strip()}" + next = BigQueryInsertJobOperator( + task_id="create_"+staging_table_name, + configuration={ + "query": { + "query": "{% include '" + f"{sql_dir}/{table.strip()}.sql" + "' %}", + "useLegacySql": False, + "destinationTable": { + "projectId": PROJECT_ID, + "datasetId": staging_dataset, + "tableId": table + }, + "allowLargeResults": True, + "createDisposition": "CREATE_IF_NEEDED", + "writeDisposition": "WRITE_TRUNCATE" + } + }, + ) + curr >> next + curr = next + curr >> wait_for_initial_tables + + # run aggregate_organizations python and load to GCS + aggregated_table = "aggregated_organizations" + + aggregate_organizations = PythonOperator( + task_id="aggregate_organizations", + op_kwargs={ + "output_file": f"{aggregated_table}.jsonl" + }, + python_callable=aggregate_organizations, + ) + + # load aggregated_organizations to BigQuery + + load_aggregated_orgs = GCSToBigQueryOperator( + task_id=f"load_{aggregated_table}", + bucket=DATA_BUCKET, + source_objects=[f"{aggregated_table}.jsonl"], + schema_object=f"{schema_dir}/{aggregated_table}.json", + destination_project_dataset_table=f"{staging_intermediate_dataset}.{aggregated_table}", + source_format="NEWLINE_DELIMITED_JSON", + create_disposition="CREATE_IF_NEEDED", + write_disposition="WRITE_TRUNCATE" + ) + + # TODO: somewhere in here we need to decide whether to load directly to the main table + # or to add a transfer step to transfer from staging to the main table; if the latter + # are there checks we want to add first? + # for now, pretend the data is in the main table already + + run_get_ai_counts = GKEStartPodOperator( + task_id="run_get_ai_counts", + project_id=PROJECT_ID, + location=GCP_ZONE, + cluster_name="us-east1-production2023-cc1-01d75926-gke", + name="run_get_ai_counts", + cmds=["/bin/bash"], + arguments=["-c", (f"echo 'getting AI counts!' 
; rm -r ai || true ; " + f"mkdir -p ai && " + f"python3 get_ai_counts.py ai/ai_company_papers.jsonl ai/ai_company_patents.jsonl && " + f"gsutil -m cp -r ai gs://{DATA_BUCKET}/{tmp_dir}/ ")], + namespace="default", + image=f"us.gcr.io/{PROJECT_ID}/parat", + get_logs=True, + startup_timeout_seconds=300, + # see also https://cloud.google.com/composer/docs/how-to/using/using-kubernetes-pod-operator#affinity-config + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [{ + "matchExpressions": [{ + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": [ + "default-pool", + ] + }] + }] + } + } + } + ) + + load_ai_papers = GCSToBigQueryOperator( + task_id=f"load_ai_company_papers", + bucket=DATA_BUCKET, + source_objects=[f"{tmp_dir}/ai/ai_company_papers.jsonl"], + schema_object=f"{schema_dir}/ai_papers_schema.json", + destination_project_dataset_table=f"{staging_intermediate_dataset}.ai_company_papers", + source_format="NEWLINE_DELIMITED_JSON", + create_disposition="CREATE_IF_NEEDED", + write_disposition="WRITE_TRUNCATE" + ) + + load_ai_patents = GCSToBigQueryOperator( + task_id=f"load_ai_company_patents", + bucket=DATA_BUCKET, + source_objects=[f"{tmp_dir}/ai/ai_company_patents.jsonl"], + schema_object=f"{schema_dir}/ai_patents_schema.json", + destination_project_dataset_table=f"{staging_intermediate_dataset}.ai_company_patents", + source_format="NEWLINE_DELIMITED_JSON", + create_disposition="CREATE_IF_NEEDED", + write_disposition="WRITE_TRUNCATE" + ) + + run_papers = [] + for paper_type in ["top", "all"]: + + run_get_paper_counts = GKEStartPodOperator( + task_id=f"run_get_{paper_type}_counts", + project_id=PROJECT_ID, + location=GCP_ZONE, + cluster_name="us-east1-production2023-cc1-01d75926-gke", + name=f"run_get_{paper_type}_counts", + cmds=["/bin/bash"], + arguments=["-c", (f"echo 'getting {paper_type} paper counts!' 
; rm -r {paper_type} || true ; " + f"mkdir -p {paper_type} && " + f"python3 {paper_type}_papers.py {paper_type}/{paper_type}_paper_counts.jsonl && " + f"gsutil -m cp -r {paper_type} gs://{DATA_BUCKET}/{tmp_dir}/ ")], + namespace="default", + image=f"us.gcr.io/{PROJECT_ID}/parat", + get_logs=True, + startup_timeout_seconds=300, + # see also https://cloud.google.com/composer/docs/how-to/using/using-kubernetes-pod-operator#affinity-config + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [{ + "matchExpressions": [{ + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": [ + "default-pool", + ] + }] + }] + } + } + } + ) + run_papers.append(run_get_paper_counts) + + # even though these are near-identical we do these in sequence -- we'd have to put in a dummy operator + # otherwise anyway and they should be fast + + load_top_papers = GCSToBigQueryOperator( + task_id=f"load_top_papers", + bucket=DATA_BUCKET, + source_objects=[f"{tmp_dir}/top/top_paper_counts.jsonl"], + schema_object=f"{schema_dir}/top_papers_schema.json", + destination_project_dataset_table=f"{staging_intermediate_dataset}.top_paper_counts", + source_format="NEWLINE_DELIMITED_JSON", + create_disposition="CREATE_IF_NEEDED", + write_disposition="WRITE_TRUNCATE" + ) + + load_all_papers = GCSToBigQueryOperator( + task_id=f"load_all_papers", + bucket=DATA_BUCKET, + source_objects=[f"{tmp_dir}/all/all_paper_counts.jsonl"], + schema_object=f"{schema_dir}/all_papers_schema.json", + destination_project_dataset_table=f"{staging_intermediate_dataset}.all_paper_counts", + source_format="NEWLINE_DELIMITED_JSON", + create_disposition="CREATE_IF_NEEDED", + write_disposition="WRITE_TRUNCATE" + ) + + + + + + ( + clear_tmp_dir + >> start + >> join_tables + >> start_initial_tables + ) + ( + wait_for_initial_tables + >> aggregate_organizations + >> load_aggregated_orgs + >> run_get_ai_counts + >> load_ai_papers + >> load_ai_patents + >> run_papers + >> load_top_papers + >> 
load_all_papers + ) + diff --git a/company_linkage/aggregate_organizations.py b/company_linkage/parat_scripts/aggregate_organizations.py similarity index 98% rename from company_linkage/aggregate_organizations.py rename to company_linkage/parat_scripts/aggregate_organizations.py index 0029ed62..424ea404 100644 --- a/company_linkage/aggregate_organizations.py +++ b/company_linkage/parat_scripts/aggregate_organizations.py @@ -2,6 +2,7 @@ from google.cloud import bigquery import json from collections import defaultdict +import subprocess # List of companies not being aggregated # note: check https://docs.google.com/spreadsheets/d/1Tq28O8qIA6T3AJ5oTHKCcscaNZsY_E4OPOUm6JaiwWA/edit#gid=0 @@ -394,7 +395,7 @@ def update_organization_data(self, org, org_id): org_info.add_sandp(org["in_sandp_500"]) org_info.add_fortune(org["in_fortune_global_500"]) - def print_output(self, output_file): + def print_output(self, output_file, local): """ Writing the aggregated organization output to file :param output_file: The output file we're writing to @@ -414,18 +415,22 @@ def print_output(self, output_file): "non_agg_children": org_info.non_agg_children} out.write(json.dumps(js, ensure_ascii=False) + "\n") out.close() + if not local: + subprocess.run(["gsutil", "-m", "cp", "-r", output_file, "gs://parat/"], check=True) -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("output_file", type=str, help="A jsonl file for writing output data to create new tables") - args = parser.parse_args() - if not args.output_file.endswith(".jsonl"): - parser.print_help() + +def aggregate_organizations(output_file, local=False): aggregator = OrganizationAggregator() aggregator.get_parents() aggregator.get_organizations() - aggregator.print_output(args.output_file) + aggregator.print_output(output_file, local) if __name__ == "__main__": - main() + parser = argparse.ArgumentParser() + parser.add_argument("output_file", type=str, help="A jsonl file for writing output data to create new 
tables") + args = parser.parse_args() + if not args.output_file.endswith(".jsonl"): + parser.print_help() + aggregate_organizations(args.output_file, local=True) + diff --git a/company_linkage/all_papers.py b/company_linkage/parat_scripts/all_papers.py similarity index 100% rename from company_linkage/all_papers.py rename to company_linkage/parat_scripts/all_papers.py diff --git a/company_linkage/deduplicate_companies.py b/company_linkage/parat_scripts/deduplicate_companies.py similarity index 100% rename from company_linkage/deduplicate_companies.py rename to company_linkage/parat_scripts/deduplicate_companies.py diff --git a/company_linkage/get_ai_counts.py b/company_linkage/parat_scripts/get_ai_counts.py similarity index 100% rename from company_linkage/get_ai_counts.py rename to company_linkage/parat_scripts/get_ai_counts.py diff --git a/company_linkage/test_aggregate_organizations.py b/company_linkage/parat_scripts/test_aggregate_organizations.py similarity index 97% rename from company_linkage/test_aggregate_organizations.py rename to company_linkage/parat_scripts/test_aggregate_organizations.py index 76709ac6..c0287b21 100644 --- a/company_linkage/test_aggregate_organizations.py +++ b/company_linkage/parat_scripts/test_aggregate_organizations.py @@ -1,6 +1,6 @@ import os import unittest -from company_linkage import aggregate_organizations +import aggregate_organizations from collections import defaultdict @@ -131,15 +131,15 @@ def test_add_grid(self): def test_add_regex(self): org = aggregate_organizations.Organization(1, "test") - org.add_regex("^hhi\s+corporation$|^hhi$|^hhi\s+corp$") - self.assertEqual(org.regex[0], "^hhi\s+corporation$|^hhi$|^hhi\s+corp$") + org.add_regex(r"^hhi\s+corporation$|^hhi$|^hhi\s+corp$") + self.assertEqual(org.regex[0], r"^hhi\s+corporation$|^hhi$|^hhi\s+corp$") self.assertEqual(len(org.regex), 1) # Don't add a duplicate entry! 
- org.add_regex("^hhi\s+corporation$|^hhi$|^hhi\s+corp$") + org.add_regex(r"^hhi\s+corporation$|^hhi$|^hhi\s+corp$") self.assertEqual(len(org.regex), 1) # Do add a new one - org.add_regex("^hhi\s+corporation$") - self.assertEqual(org.regex[1], "^hhi\s+corporation$") + org.add_regex(r"^hhi\s+corporation$") + self.assertEqual(org.regex[1], r"^hhi\s+corporation$") self.assertEqual(len(org.regex), 2) def test_add_bgov_id(self): diff --git a/company_linkage/test_ai_counts.py b/company_linkage/parat_scripts/test_ai_counts.py similarity index 98% rename from company_linkage/test_ai_counts.py rename to company_linkage/parat_scripts/test_ai_counts.py index 83bf622b..3845ffff 100644 --- a/company_linkage/test_ai_counts.py +++ b/company_linkage/parat_scripts/test_ai_counts.py @@ -1,5 +1,5 @@ import unittest -from company_linkage.get_ai_counts import CountGetter +from get_ai_counts import CountGetter import warnings diff --git a/company_linkage/top_papers.py b/company_linkage/parat_scripts/top_papers.py similarity index 94% rename from company_linkage/top_papers.py rename to company_linkage/parat_scripts/top_papers.py index 962bcf97..a11da6af 100644 --- a/company_linkage/top_papers.py +++ b/company_linkage/parat_scripts/top_papers.py @@ -1,6 +1,6 @@ import argparse -from company_linkage.get_ai_counts import CountGetter +from get_ai_counts import CountGetter def main() -> None: diff --git a/company_linkage/push_to_airflow.sh b/company_linkage/push_to_airflow.sh old mode 100644 new mode 100755 index ca988b32..7d3813b7 --- a/company_linkage/push_to_airflow.sh +++ b/company_linkage/push_to_airflow.sh @@ -3,10 +3,16 @@ gsutil rm -r gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/airtable_to_bq_confi gsutil cp -r airtable_configs/parat_preannotation gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/airtable_to_bq_config/ gsutil cp -r airtable_configs/parat_validate gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/airtable_to_bq_config/ -gsutil cp airtable_queries/parat_preannotation/* 
gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/bq_to_airtable/parat_preannotation/ -gsutil cp airtable_queries/parat_preannotation/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/bq_to_airtable/parat_validate/ gsutil cp airtable_queries/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/airtable_to_bq/parat_preannotation/ gsutil cp airtable_queries/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/airtable_to_bq/parat_validate/ gsutil cp airtable_schemas/parat_preannotation/* gs://airflow-data-exchange-development/schemas/airtable_to_bq/parat_preannotation/ -gsutil cp airtable_schemas/parat_validate/* gs://airflow-data-exchange-development/schemas/airtable_to_bq/parat_validate/ \ No newline at end of file +gsutil cp airtable_schemas/parat_validate/* gs://airflow-data-exchange-development/schemas/airtable_to_bq/parat_validate/ + +gsutil cp parat_data_dag.py gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/ +gsutil cp aggregate_organizations.py gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/ +gsutil cp sequences/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sequences/parat/ +gsutil rm gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/parat/* +gsutil cp sql/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/parat/ +gsutil cp schemas/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/schemas/parat/ +gsutil -m cp -r parat_scripts/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/parat_scripts/ \ No newline at end of file diff --git a/company_linkage/requirements.txt b/company_linkage/requirements.txt new file mode 100644 index 00000000..10b51281 --- /dev/null +++ b/company_linkage/requirements.txt @@ -0,0 +1,56 @@ +attrs==21.2.0 +cachetools==4.1.1 +certifi==2020.6.20 +cffi==1.14.3 +chardet==3.0.4 +coverage==5.5 +google-api-core==1.30.0 +google-auth==1.30.2 +google-auth-oauthlib==0.4.4 +google-cloud-bigquery==2.20.0 +google-cloud-bigquery-storage==2.4.0 +google-cloud-core==1.6.0 +google-cloud-translate==3.2.0 +google-crc32c==1.1.2 
+google-resumable-media==1.3.0 +googleapis-common-protos==1.53.0 +grpcio==1.33.1 +idna==2.10 +iniconfig==1.1.1 +libcst==0.3.13 +mypy-extensions==0.4.3 +numpy==1.20.3 +oauthlib==3.1.0 +packaging==20.9 +pandas==1.1.3 +pandas-gbq==0.14.0 +Pillow==8.2.0 +pluggy==0.13.1 +pprintpp==0.4.0 +proto-plus==1.11.0 +protobuf==3.13.0 +py==1.10.0 +pyarrow==3.0.0 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pycld2==0.41 +pycountry==20.7.3 +pycountry-convert==0.7.2 +pycparser==2.20 +pydata-google-auth==1.1.0 +pyparsing==2.4.7 +pytest==6.2.4 +pytest-cov==2.12.1 +pytest-mock==3.6.1 +python-dateutil==2.8.1 +pytz==2020.1 +PyYAML==5.3.1 +repoze.lru==0.7 +requests==2.24.0 +requests-oauthlib==1.3.0 +rsa==4.6 +six==1.15.0 +toml==0.10.2 +typing-extensions==3.7.4.3 +typing-inspect==0.6.0 +urllib3==1.25.11 diff --git a/company_linkage/schemas/aggregated_organizations_schema.json b/company_linkage/schemas/aggregated_organizations.json similarity index 100% rename from company_linkage/schemas/aggregated_organizations_schema.json rename to company_linkage/schemas/aggregated_organizations.json diff --git a/company_linkage/sequences.txt b/company_linkage/sequences.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/company_linkage/sequences/initial_data.csv b/company_linkage/sequences/initial_data.csv new file mode 100644 index 00000000..a6e65b82 --- /dev/null +++ b/company_linkage/sequences/initial_data.csv @@ -0,0 +1,6 @@ +high_resolution_entities,organizations +ai_companies_visualization,ai_publications +ai_companies_visualization,linked_ai_patents +ai_companies_visualization,top_conference_pubs +ai_companies_visualization,pubs_in_top_conferences +ai_companies_visualization,all_publications \ No newline at end of file diff --git a/company_linkage/sql/selecting_ai_publications.sql b/company_linkage/sql/ai_publications.sql similarity index 95% rename from company_linkage/sql/selecting_ai_publications.sql rename to company_linkage/sql/ai_publications.sql index 6877d104..75a0b0f6 100644 --- 
a/company_linkage/sql/selecting_ai_publications.sql +++ b/company_linkage/sql/ai_publications.sql @@ -1,8 +1,6 @@ -- Pulling every AI-associated publication id linked to every grid id and every organization name -- We also include years because we'll want those later for yearly counts -- and cv/robotics/nlp so we can filter on these -CREATE OR REPLACE TABLE - ai_companies_visualization.ai_publications AS WITH ai_papers AS ( SELECT diff --git a/company_linkage/sql/selecting_all_publications.sql b/company_linkage/sql/all_publications.sql similarity index 100% rename from company_linkage/sql/selecting_all_publications.sql rename to company_linkage/sql/all_publications.sql diff --git a/company_linkage/sql/selecting_ai_patents.sql b/company_linkage/sql/linked_ai_patents.sql similarity index 98% rename from company_linkage/sql/selecting_ai_patents.sql rename to company_linkage/sql/linked_ai_patents.sql index e5db1fff..e667c310 100644 --- a/company_linkage/sql/selecting_ai_patents.sql +++ b/company_linkage/sql/linked_ai_patents.sql @@ -1,7 +1,6 @@ -- Pulling every AI-associated patent family id linked to every grid id of any assignee for that patent, and all the assignee names -- We also pull in the AI subcategories and the years -- We also attempt to add in "fake" families for the patents that are missing patent families -create or replace table ai_companies_visualization.linked_ai_patents as with patents_orig as ( SELECT -- Pulling in the current assignee grid ids from dimensions diff --git a/company_linkage/sql/create_organizations_from_airtable_imports.sql b/company_linkage/sql/organizations.sql similarity index 97% rename from company_linkage/sql/create_organizations_from_airtable_imports.sql rename to company_linkage/sql/organizations.sql index 0fa42f1d..60f875d5 100644 --- a/company_linkage/sql/create_organizations_from_airtable_imports.sql +++ b/company_linkage/sql/organizations.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE TABLE - high_resolution_entities.organizations 
AS SELECT * REPLACE( ( SELECT diff --git a/company_linkage/sql/pulling_publications_in_top_ai_conferences.sql b/company_linkage/sql/pubs_in_top_conferences.sql similarity index 93% rename from company_linkage/sql/pulling_publications_in_top_ai_conferences.sql rename to company_linkage/sql/pubs_in_top_conferences.sql index 08487408..cc1bee22 100644 --- a/company_linkage/sql/pulling_publications_in_top_ai_conferences.sql +++ b/company_linkage/sql/pubs_in_top_conferences.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.pubs_in_top_conferences AS WITH -- Associating GRIDs to the merged paper ids affils AS ( diff --git a/company_linkage/sql/selecting_top_conference_pubs.sql b/company_linkage/sql/top_conference_pubs.sql similarity index 97% rename from company_linkage/sql/selecting_top_conference_pubs.sql rename to company_linkage/sql/top_conference_pubs.sql index af563615..25993325 100644 --- a/company_linkage/sql/selecting_top_conference_pubs.sql +++ b/company_linkage/sql/top_conference_pubs.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.top_conference_pubs AS WITH venues AS ( SELECT From 5127270e49d92efae458c07ef7843c3519c2715b Mon Sep 17 00:00:00 2001 From: Rebecca Date: Thu, 28 Sep 2023 12:49:54 -0400 Subject: [PATCH 02/17] Update queries needed pre-python scripts for great update --- company_linkage/sql/ai_publications.sql | 32 +++++++++++------------ company_linkage/sql/linked_ai_patents.sql | 14 +++++----- company_linkage/sql/organizations.sql | 14 ++++++---- 3 files changed, 32 insertions(+), 28 deletions(-) diff --git a/company_linkage/sql/ai_publications.sql b/company_linkage/sql/ai_publications.sql index 75a0b0f6..2aacb37f 100644 --- a/company_linkage/sql/ai_publications.sql +++ b/company_linkage/sql/ai_publications.sql @@ -12,26 +12,26 @@ WITH gcp-cset-projects.article_classification.predictions WHERE ai_filtered = TRUE OR cv_filtered = TRUE OR nlp_filtered = TRUE OR robotics_filtered = TRUE), - gr AS ( - 
-- Adding in org names and country data using GRID + ror AS ( + -- Adding in org names and country data using ROR SELECT id, name AS org_name, - country_name AS country + country.country_name AS country FROM - gcp-cset-projects.gcp_cset_grid.api_grid), - merged_grids AS ( - -- Selecting all the merged ids and grid ids from the links table + gcp_cset_ror.ror), + merged_rors AS ( + -- Selecting all the merged ids and ror ids from the literature table SELECT DISTINCT merged_id, - grid_id, + ror_id, org_name, cv_filtered as cv, nlp_filtered as nlp, robotics_filtered as robotics FROM - `gcp-cset-projects.gcp_cset_links_v2.paper_affiliations_merged` + literature.affiliations -- if they're AI papers INNER JOIN ai_papers USING (merged_id)), @@ -40,20 +40,20 @@ WITH merged_id, year FROM - `gcp-cset-projects.gcp_cset_links_v2.corpus_merged`) + literature.papers) SELECT - -- Adding in the org name and country associated with the grid id - merged_grids.* EXCEPT (org_name), - COALESCE(gr.org_name, merged_grids.org_name) as org_name, + -- Adding in the org name and country associated with the ror id + merged_rors.* EXCEPT (org_name), + COALESCE(ror.org_name, merged_rors.org_name) as org_name, country, year FROM - merged_grids + merged_rors LEFT JOIN - gr + ror ON - merged_grids.Grid_ID = gr.id + merged_rors.ror_id = ror.id LEFT JOIN article_years ON - merged_grids.merged_id = article_years.merged_id \ No newline at end of file + merged_rors.merged_id = article_years.merged_id \ No newline at end of file diff --git a/company_linkage/sql/linked_ai_patents.sql b/company_linkage/sql/linked_ai_patents.sql index e667c310..ee7360b2 100644 --- a/company_linkage/sql/linked_ai_patents.sql +++ b/company_linkage/sql/linked_ai_patents.sql @@ -3,13 +3,13 @@ -- We also attempt to add in "fake" families for the patents that are missing patent families with patents_orig as ( SELECT - -- Pulling in the current assignee grid ids from dimensions + -- Pulling in the current assignee ror ids from 
dimensions patent_id, family_id, assignee, - grid + ror_id FROM - `gcp-cset-projects.unified_patents.normalized_patent_assignees`), + unified_patents.assignees_normalized), all_ai as ( -- Selecting all the family ids and patent IDs to get AI patents -- Also select the year so we can get counts by year @@ -50,13 +50,13 @@ all_ai as ( Machine_Learning, Search_Methods FROM - gcp-cset-projects.unified_patents.ai_patents), + unified_patents.ai_patents), patent_years as ( SELECT patent_id, EXTRACT(year FROM first_priority_date) as priority_year FROM - gcp-cset-projects.unified_patents.patent_dates + unified_patents.dates ) SELECT DISTINCT @@ -65,7 +65,7 @@ all_ai as ( -- We're just doing this so our counts aren't blank COALESCE(family_id, "X-" || patent_id) as family_id, assignee, - grid, + ror_id, MIN(priority_year) as priority_year, LOGICAL_OR(Physical_Sciences_and_Engineering) as Physical_Sciences_and_Engineering, LOGICAL_OR(Life_Sciences) as Life_Sciences, @@ -110,6 +110,6 @@ all_ai as ( USING (patent_id)) WHERE priority_year IS NOT NULL GROUP BY - grid, + ror_id, assignee, family_id \ No newline at end of file diff --git a/company_linkage/sql/organizations.sql b/company_linkage/sql/organizations.sql index 60f875d5..b93bf8b1 100644 --- a/company_linkage/sql/organizations.sql +++ b/company_linkage/sql/organizations.sql @@ -35,7 +35,7 @@ FROM ( organizations_joined.name, STRUCT(city, province_state, - country) AS location, + organizations_joined.country) AS location, website, ARRAY_AGG(STRUCT(alias_language, alias)) AS aliases, @@ -52,9 +52,9 @@ FROM ( ticker)) AS market, STRUCT(crunchbase_uuid, crunchbase_url) AS crunchbase, - ARRAY_AGG(DISTINCT grid IGNORE NULLS) AS grid, + ARRAY_AGG(DISTINCT ror.id IGNORE NULLS) AS ror_id, regex, - ARRAY_AGG(DISTINCT bgov IGNORE NULLS) AS BGOV_id, + ARRAY_AGG(DISTINCT bgov_id IGNORE NULLS) AS BGOV_id, linkedin, CASE WHEN in_sandp_500 IS TRUE THEN TRUE @@ -68,7 +68,7 @@ FROM ( FALSE END AS in_fortune_global_500, - comment + 
ids_joined.comment FROM parat_input.organizations_joined LEFT JOIN @@ -103,12 +103,16 @@ FROM ( parat_input.linkedin_joined USING (CSET_id) + LEFT JOIN + gcp_cset_ror.ror + ON + grid_joined.grid = external_ids.GRID.all GROUP BY CSET_id, name, city, province_state, - country, + organizations_joined.country, website, crunchbase_uuid, crunchbase_url, From 770612b0e07301da939f7c736f80624cf35202c9 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Tue, 10 Oct 2023 14:53:52 -0400 Subject: [PATCH 03/17] Get organization aggregation working; update staging --- company_linkage/parat_data_dag.py | 36 +++++++++---------- .../parat_scripts/aggregate_organizations.py | 20 +++++------ .../test_aggregate_organizations.py | 24 ++++++------- company_linkage/push_to_airflow.sh | 2 ++ .../schemas/aggregated_organizations.json | 4 +-- company_linkage/sequences/initial_data.csv | 10 +++--- .../sequences/visualization_data.csv | 1 + company_linkage/sql/all_publications.sql | 4 +-- .../sql/pubs_in_top_conferences.sql | 2 +- 9 files changed, 51 insertions(+), 52 deletions(-) create mode 100644 company_linkage/sequences/visualization_data.csv diff --git a/company_linkage/parat_data_dag.py b/company_linkage/parat_data_dag.py index e64c2fca..2416ff86 100644 --- a/company_linkage/parat_data_dag.py +++ b/company_linkage/parat_data_dag.py @@ -29,10 +29,9 @@ initial_dataset = "parat_input" intermediate_dataset = "high_resolution_entities" production_dataset = "ai_companies_visualization" -staging_intermediate_dataset = f"staging_{intermediate_dataset}" staging_dataset = f"staging_{production_dataset}" sql_dir = "sql/parat" -schema_dir = "schemas/parat" +schema_dir = "parat/schemas" tmp_dir = f"{production_dataset}/tmp" default_args = get_default_args() @@ -49,7 +48,6 @@ user_defined_macros={ "staging_dataset": staging_dataset, "production_dataset": production_dataset, - "staging_intermediate_dataset": staging_intermediate_dataset, "intermediate_dataset": intermediate_dataset, "initial_dataset": 
initial_dataset }, @@ -102,16 +100,16 @@ curr = start_initial_tables for line in open(seq_path_prefix + initial_query_sequence).readlines(): dataset, table = line.split(",") - staging_table_name = f"staging_{dataset}.{table.strip()}" - next = BigQueryInsertJobOperator( - task_id="create_"+staging_table_name, + table_name = f"{dataset}.{table.strip()}" + next_tab = BigQueryInsertJobOperator( + task_id=f"create_{table_name}", configuration={ "query": { "query": "{% include '" + f"{sql_dir}/{table.strip()}.sql" + "' %}", "useLegacySql": False, "destinationTable": { "projectId": PROJECT_ID, - "datasetId": staging_dataset, + "datasetId": dataset, "tableId": table }, "allowLargeResults": True, @@ -120,8 +118,8 @@ } }, ) - curr >> next - curr = next + curr >> next_tab + curr = next_tab curr >> wait_for_initial_tables # run aggregate_organizations python and load to GCS @@ -140,9 +138,9 @@ load_aggregated_orgs = GCSToBigQueryOperator( task_id=f"load_{aggregated_table}", bucket=DATA_BUCKET, - source_objects=[f"{aggregated_table}.jsonl"], + source_objects=[f"{tmp_dir}/{aggregated_table}.jsonl"], schema_object=f"{schema_dir}/{aggregated_table}.json", - destination_project_dataset_table=f"{staging_intermediate_dataset}.{aggregated_table}", + destination_project_dataset_table=f"{intermediate_dataset}.{aggregated_table}", source_format="NEWLINE_DELIMITED_JSON", create_disposition="CREATE_IF_NEEDED", write_disposition="WRITE_TRUNCATE" @@ -189,9 +187,9 @@ load_ai_papers = GCSToBigQueryOperator( task_id=f"load_ai_company_papers", bucket=DATA_BUCKET, - source_objects=["ai_company_papers.jsonl"], + source_objects=[f"{tmp_dir}/ai_company_papers.jsonl"], schema_object=f"{schema_dir}/ai_papers_schema.json", - destination_project_dataset_table=f"{staging_intermediate_dataset}.ai_company_papers", + destination_project_dataset_table=f"{staging_dataset}.ai_company_papers", source_format="NEWLINE_DELIMITED_JSON", create_disposition="CREATE_IF_NEEDED", write_disposition="WRITE_TRUNCATE" @@ 
-200,9 +198,9 @@ load_ai_patents = GCSToBigQueryOperator( task_id=f"load_ai_company_patents", bucket=DATA_BUCKET, - source_objects=["ai_company_patents.jsonl"], + source_objects=[f"{tmp_dir}/ai_company_patents.jsonl"], schema_object=f"{schema_dir}/ai_patents_schema.json", - destination_project_dataset_table=f"{staging_intermediate_dataset}.ai_company_patents", + destination_project_dataset_table=f"{staging_dataset}.ai_company_patents", source_format="NEWLINE_DELIMITED_JSON", create_disposition="CREATE_IF_NEEDED", write_disposition="WRITE_TRUNCATE" @@ -251,9 +249,9 @@ load_top_papers = GCSToBigQueryOperator( task_id=f"load_top_papers", bucket=DATA_BUCKET, - source_objects=["top_paper_counts.jsonl"], + source_objects=[f"{tmp_dir}/top_paper_counts.jsonl"], schema_object=f"{schema_dir}/top_papers_schema.json", - destination_project_dataset_table=f"{staging_intermediate_dataset}.top_paper_counts", + destination_project_dataset_table=f"{staging_dataset}.top_paper_counts", source_format="NEWLINE_DELIMITED_JSON", create_disposition="CREATE_IF_NEEDED", write_disposition="WRITE_TRUNCATE" @@ -262,9 +260,9 @@ load_all_papers = GCSToBigQueryOperator( task_id=f"load_all_papers", bucket=DATA_BUCKET, - source_objects=["all_paper_counts.jsonl"], + source_objects=[f"{tmp_dir}/all_paper_counts.jsonl"], schema_object=f"{schema_dir}/all_papers_schema.json", - destination_project_dataset_table=f"{staging_intermediate_dataset}.all_paper_counts", + destination_project_dataset_table=f"{staging_dataset}.all_paper_counts", source_format="NEWLINE_DELIMITED_JSON", create_disposition="CREATE_IF_NEEDED", write_disposition="WRITE_TRUNCATE" diff --git a/company_linkage/parat_scripts/aggregate_organizations.py b/company_linkage/parat_scripts/aggregate_organizations.py index 424ea404..43dcbddc 100644 --- a/company_linkage/parat_scripts/aggregate_organizations.py +++ b/company_linkage/parat_scripts/aggregate_organizations.py @@ -23,7 +23,7 @@ def __init__(self, cset_id, name): self.market = [] 
self.crunchbase = {} self.child_crunchbase = [] - self.grid = [] + self.ror = [] self.regex = [] self.bgov_id = [] self.comment = None @@ -130,14 +130,14 @@ def add_child_crunchbase(self, uuid, url): if crunchbase not in self.child_crunchbase and crunchbase != self.crunchbase: self.child_crunchbase.append(crunchbase) - def add_grid(self, grid): + def add_ror(self, ror): """ - Adding GRID (from grid.ac) for aggregation - :param grid: grid value + Adding ROR for aggregation + :param ror: ror value :return: """ - if grid and grid not in self.grid: - self.grid.append(grid) + if ror and ror not in self.ror: + self.ror.append(ror) def add_regex(self, regex): """ @@ -369,8 +369,8 @@ def update_organization_identifiers(self, org, org_id): org_info.add_child_crunchbase(org["crunchbase"]["crunchbase_uuid"], org["crunchbase"]["crunchbase_url"]) else: org_info.add_crunchbase(org["crunchbase"]["crunchbase_uuid"], org["crunchbase"]["crunchbase_url"]) - for grid in org["grid"]: - org_info.add_grid(grid) + for ror in org["ror_id"]: + org_info.add_ror(ror) org_info.add_regex(org["regex"]) org_info.add_linkedin(org["linkedin"]) org_info.add_bgov_id(org["BGOV_id"]) @@ -408,7 +408,7 @@ def print_output(self, output_file, local): "aliases": org_info.aliases, "parent": org_info.parent, "permid": org_info.permid, "market": org_info.market, "crunchbase": org_info.crunchbase, "child_crunchbase": org_info.child_crunchbase, - "grid": org_info.grid, "regex": org_info.regex, + "ror_id": org_info.ror, "regex": org_info.regex, "BGOV_id": org_info.bgov_id, "linkedin": org_info.linkedin, "in_sandp_500": org_info.in_sandp_500, "in_fortune_global_500": org_info.in_fortune_global_500, "comment": org_info.comment, "children": org_info.children, @@ -416,7 +416,7 @@ def print_output(self, output_file, local): out.write(json.dumps(js, ensure_ascii=False) + "\n") out.close() if not local: - subprocess.run(["gsutil", "-m", "cp", "-r", output_file, "gs://parat/"], check=True) + subprocess.run(["gsutil", 
"-m", "cp", "-r", output_file, "gs://airflow-data-exchange/ai_companies_visualization/tmp/"], check=True) def aggregate_organizations(output_file, local=False): diff --git a/company_linkage/parat_scripts/test_aggregate_organizations.py b/company_linkage/parat_scripts/test_aggregate_organizations.py index c0287b21..c9e31397 100644 --- a/company_linkage/parat_scripts/test_aggregate_organizations.py +++ b/company_linkage/parat_scripts/test_aggregate_organizations.py @@ -18,7 +18,7 @@ def test_init(self): self.assertEqual(org.market, []) self.assertEqual(org.crunchbase, {}) self.assertEqual(org.child_crunchbase, []) - self.assertEqual(org.grid, []) + self.assertEqual(org.ror, []) self.assertEqual(org.regex, []) self.assertEqual(org.bgov_id, []) self.assertEqual(org.comment, None) @@ -116,18 +116,18 @@ def test_add_child_crunchbase(self): "https://www.crunchbase.com/organization/algorithmia") self.assertEqual(len(org.child_crunchbase), 2) - def test_add_grid(self): + def test_add_ror(self): org = aggregate_organizations.Organization(1, "test") - org.add_grid("grid.419660.c") - self.assertEqual(org.grid[0], "grid.419660.c") - self.assertEqual(len(org.grid), 1) + org.add_ror("https://ror.org/05a8p8995") + self.assertEqual(org.ror[0], "https://ror.org/05a8p8995") + self.assertEqual(len(org.ror), 1) # Don't add a duplicate entry! 
- org.add_grid("grid.419660.c") - self.assertEqual(len(org.grid), 1) + org.add_ror("https://ror.org/05a8p8995") + self.assertEqual(len(org.ror), 1) # Do add a new one - org.add_grid("grid.481863.0") - self.assertEqual(org.grid[1], "grid.481863.0") - self.assertEqual(len(org.grid), 2) + org.add_ror("https://ror.org/00kdbj440") + self.assertEqual(org.ror[1], "https://ror.org/00kdbj440") + self.assertEqual(len(org.ror), 2) def test_add_regex(self): org = aggregate_organizations.Organization(1, "test") @@ -157,8 +157,8 @@ def test_add_bgov_id(self): def test_add_comment(self): org = aggregate_organizations.Organization(1, "test") - org.add_comment("grid id not available") - self.assertEqual(org.comment, "grid id not available") + org.add_comment("crunchbase id not available") + self.assertEqual(org.comment, "crunchbase id not available") other_org = aggregate_organizations.Organization(2, "test_2") other_org.add_comment("") self.assertEqual(other_org.comment, None) diff --git a/company_linkage/push_to_airflow.sh b/company_linkage/push_to_airflow.sh index 7d3813b7..675430e0 100755 --- a/company_linkage/push_to_airflow.sh +++ b/company_linkage/push_to_airflow.sh @@ -15,4 +15,6 @@ gsutil cp sequences/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sequences/p gsutil rm gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/parat/* gsutil cp sql/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/parat/ gsutil cp schemas/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/schemas/parat/ +gsutil rm -r gs://airflow-data-exchange/parat/schemas/* +gsutil cp schemas/* gs://airflow-data-exchange/parat/schemas/ gsutil -m cp -r parat_scripts/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/parat_scripts/ \ No newline at end of file diff --git a/company_linkage/schemas/aggregated_organizations.json b/company_linkage/schemas/aggregated_organizations.json index 40cf0ef4..22aa3e28 100644 --- a/company_linkage/schemas/aggregated_organizations.json +++ 
b/company_linkage/schemas/aggregated_organizations.json @@ -157,9 +157,9 @@ }, { "mode": "REPEATED", - "name": "grid", + "name": "ror_id", "type": "STRING", - "description": "The company's GRID identifier." + "description": "The company's ROR identifier." }, { "mode": "REPEATED", diff --git a/company_linkage/sequences/initial_data.csv b/company_linkage/sequences/initial_data.csv index a6e65b82..c1a10432 100644 --- a/company_linkage/sequences/initial_data.csv +++ b/company_linkage/sequences/initial_data.csv @@ -1,6 +1,6 @@ high_resolution_entities,organizations -ai_companies_visualization,ai_publications -ai_companies_visualization,linked_ai_patents -ai_companies_visualization,top_conference_pubs -ai_companies_visualization,pubs_in_top_conferences -ai_companies_visualization,all_publications \ No newline at end of file +staging_ai_companies_visualization,ai_publications +staging_ai_companies_visualization,linked_ai_patents +staging_ai_companies_visualization,top_conference_pubs +staging_ai_companies_visualization,pubs_in_top_conferences +staging_ai_companies_visualization,all_publications \ No newline at end of file diff --git a/company_linkage/sequences/visualization_data.csv b/company_linkage/sequences/visualization_data.csv new file mode 100644 index 00000000..49ce1dd8 --- /dev/null +++ b/company_linkage/sequences/visualization_data.csv @@ -0,0 +1 @@ +staging_ai_companies_visualization,initial_visualization_data diff --git a/company_linkage/sql/all_publications.sql b/company_linkage/sql/all_publications.sql index 6631575f..e11ef86f 100644 --- a/company_linkage/sql/all_publications.sql +++ b/company_linkage/sql/all_publications.sql @@ -1,6 +1,4 @@ - -- Pulling every publication id linked to every author affiliate and all years because we'll want those later for yearly counts -CREATE OR REPLACE TABLE - ai_companies_visualization.all_publications AS + -- Pulling every publication id linked to every author affiliate and all years because we'll want those later for 
yearly count WITH ror AS ( -- Adding in org names and country data using ROR diff --git a/company_linkage/sql/pubs_in_top_conferences.sql b/company_linkage/sql/pubs_in_top_conferences.sql index cc1bee22..3b23ee19 100644 --- a/company_linkage/sql/pubs_in_top_conferences.sql +++ b/company_linkage/sql/pubs_in_top_conferences.sql @@ -27,7 +27,7 @@ SELECT ror_id, year FROM - ai_companies_visualization.top_conference_pubs AS top_pubs + staging_ai_companies_visualization.top_conference_pubs AS top_pubs -- We're inner joining because if there's no affiliate information at all we have no way to even evaluate this data for our purposes INNER JOIN affils From cf25b26ddae88a152b1f9d344f9a9ab7b089bd38 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Tue, 10 Oct 2023 16:02:16 -0400 Subject: [PATCH 04/17] Refactor names for idempotence; create sequences file --- company_linkage/README.md | 38 +++++++++---------- company_linkage/parat_data_dag.py | 4 +- .../sequences/visualization_data.csv | 18 +++++++++ .../sql/adding_paper_patent_data.sql | 34 ----------------- ...l => initial_paper_visualization_data.sql} | 2 - ... => initial_patent_visualization_data.sql} | 2 - ...ons.sql => initial_visualization_data.sql} | 0 ... initial_workforce_visualization_data.sql} | 0 company_linkage/sql/merged_ai_papers.sql | 13 ------- company_linkage/sql/omitting_companies.sql | 15 -------- ...apers.sql => paper_visualization_data.sql} | 0 ...aper_visualization_data_with_clusters.sql} | 0 ...lization_data_with_company_references.sql} | 0 ... 
=> paper_visualization_data_with_mag.sql} | 0 ...paper_visualization_data_with_methods.sql} | 0 ...> paper_visualization_data_with_tasks.sql} | 0 ...ents.sql => patent_visualization_data.sql} | 0 ...atent_visualization_data_with_by_year.sql} | 0 ...ny_metadata.sql => visualization_data.sql} | 0 ...ql => visualization_data_omit_by_rule.sql} | 0 ...=> visualization_data_with_all_papers.sql} | 0 ...ql => visualization_data_with_by_year.sql} | 0 ...=> visualization_data_with_top_papers.sql} | 0 ...e.sql => workforce_visualization_data.sql} | 0 ...force_visualization_data_with_ai_jobs.sql} | 0 25 files changed, 39 insertions(+), 87 deletions(-) delete mode 100644 company_linkage/sql/adding_paper_patent_data.sql rename company_linkage/sql/{creating_paper_visualization_data.sql => initial_paper_visualization_data.sql} (90%) rename company_linkage/sql/{creating_patent_visualization_data.sql => initial_patent_visualization_data.sql} (98%) rename company_linkage/sql/{creating_initial_visualization_data_publications.sql => initial_visualization_data.sql} (100%) rename company_linkage/sql/{creating_workforce_visualization_data.sql => initial_workforce_visualization_data.sql} (100%) delete mode 100644 company_linkage/sql/merged_ai_papers.sql delete mode 100644 company_linkage/sql/omitting_companies.sql rename company_linkage/sql/{omit_by_rule_papers.sql => paper_visualization_data.sql} (100%) rename company_linkage/sql/{adding_top_science_map_clusters.sql => paper_visualization_data_with_clusters.sql} (100%) rename company_linkage/sql/{adding_company_references.sql => paper_visualization_data_with_company_references.sql} (100%) rename company_linkage/sql/{adding_top_mag_ai_fields.sql => paper_visualization_data_with_mag.sql} (100%) rename company_linkage/sql/{adding_top_methods.sql => paper_visualization_data_with_methods.sql} (100%) rename company_linkage/sql/{adding_top_tasks.sql => paper_visualization_data_with_tasks.sql} (100%) rename 
company_linkage/sql/{omit_by_rule_patents.sql => patent_visualization_data.sql} (100%) rename company_linkage/sql/{adding_ai_patents_by_year_to_visualization.sql => patent_visualization_data_with_by_year.sql} (100%) rename company_linkage/sql/{adding_crunchbase_company_metadata.sql => visualization_data.sql} (100%) rename company_linkage/sql/{omit_by_rule.sql => visualization_data_omit_by_rule.sql} (100%) rename company_linkage/sql/{adding_all_paper_counts.sql => visualization_data_with_all_papers.sql} (100%) rename company_linkage/sql/{adding_ai_pubs_by_year_to_visualization.sql => visualization_data_with_by_year.sql} (100%) rename company_linkage/sql/{adding_top_paper_counts.sql => visualization_data_with_top_papers.sql} (100%) rename company_linkage/sql/{omit_by_rule_workforce.sql => workforce_visualization_data.sql} (100%) rename company_linkage/sql/{adding_ai_jobs_to_workforce_visualization.sql => workforce_visualization_data_with_ai_jobs.sql} (100%) diff --git a/company_linkage/README.md b/company_linkage/README.md index 47438633..e3248a3b 100644 --- a/company_linkage/README.md +++ b/company_linkage/README.md @@ -31,25 +31,25 @@ run some of this code as-is. 13. Upload `top_paper_counts.jsonl` to `ai_companies_visualization.top_paper_counts` using the [top_papers_schema](schemas/top_papers_schema.json) 14. `python3 all_papers.py all_paper_counts.jsonl` 15. Upload `all_paper_counts.jsonl` to `ai_companies_visualization.total_paper_counts` using the [all_papers_schema](schemas/all_papers_schema.json) -16. [creating_initial_visualization_data_publications.sql](sql/creating_initial_visualization_data_publications.sql) -17. [adding_ai_pubs_by_year_to_visualization.sql](sql/adding_ai_pubs_by_year_to_visualization.sql) -18. [creating_patent_visualization_data.sql](sql/creating_patent_visualization_data.sql) -19. [adding_ai_patents_by_year_to_visualization.sql](sql/adding_ai_patents_by_year_to_visualization.sql) -20. 
[creating_paper_visualization_data.sql](sql/creating_paper_visualization_data.sql) -21. [adding_top_mag_ai_fields.sql](sql/adding_top_mag_ai_fields.sql) -22. [adding_top_science_map_clusters.sql](sql/adding_top_science_map_clusters.sql) -23. [adding_company_references.sql](sql/adding_company_references.sql) -24. [adding_top_tasks.sql](sql/adding_top_tasks.sql) -25. [adding_top_methods.sql](sql/adding_top_methods.sql) -26. [adding_top_paper_counts.sql](sql/adding_top_paper_counts.sql) -27. [adding_all_paper_counts.sql](sql/adding_all_paper_counts.sql) -28. [creating_workforce_visualization_data.sql](sql/creating_workforce_visualization_data.sql) -29. [adding_ai_jobs_to_workforce_visualization.sql](sql/adding_ai_jobs_to_workforce_visualization.sql) -30. [omit_by_rule.sql](sql/omit_by_rule.sql) -31. [omit_by_rule_papers.sql](sql/omit_by_rule_papers.sql) -32. [omit_by_rule_patents.sql](sql/omit_by_rule_patents.sql) -33. [omit_by_rule_workforce.sql](sql/omit_by_rule_workforce.sql) -34. [adding_crunchbase_company_metadata.sql](sql/adding_crunchbase_company_metadata.sql) +16. [initial_visualization_data.sql](sql/initial_visualization_data.sql) +17. [visualization_data_with_by_year.sql](sql/visualization_data_with_by_year.sql) +18. [initial_patent_visualization_data.sql](sql/initial_patent_visualization_data.sql) +19. [patent_visualization_data_with_by_year.sql](sql/patent_visualization_data_with_by_year.sql) +20. [initial_paper_visualization_data.sql](sql/initial_paper_visualization_data.sql) +21. [paper_visualization_data_with_mag.sql](sql/paper_visualization_data_with_mag.sql) +22. [paper_visualization_data_with_clusters.sql](sql/paper_visualization_data_with_clusters.sql) +23. [paper_visualization_data_with_company_references.sql](sql/paper_visualization_data_with_company_references.sql) +24. [paper_visualization_data_with_tasks.sql](sql/paper_visualization_data_with_tasks.sql) +25. 
[paper_visualization_data_with_methods.sql](sql/paper_visualization_data_with_methods.sql) +26. [visualization_data_with_top_papers.sql](sql/visualization_data_with_top_papers.sql) +27. [visualization_data_with_all_papers.sql](sql/visualization_data_with_all_papers.sql) +28. [initial_workforce_visualization_data.sql](sql/initial_workforce_visualization_data.sql) +29. [workforce_visualization_data_with_ai_jobs.sql](sql/workforce_visualization_data_with_ai_jobs.sql) +30. [visualization_data_omit_by_rule.sql](sql/visualization_data_omit_by_rule.sql) +31. [paper_visualization_data.sql](sql/paper_visualization_data.sql) +32. [patent_visualization_data.sql](sql/patent_visualization_data.sql) +33. [workforce_visualization_data.sql](sql/workforce_visualization_data.sql) +34. [visualization_data.sql](sql/visualization_data.sql) # Deployment diff --git a/company_linkage/parat_data_dag.py b/company_linkage/parat_data_dag.py index 2416ff86..029a0ec3 100644 --- a/company_linkage/parat_data_dag.py +++ b/company_linkage/parat_data_dag.py @@ -160,7 +160,7 @@ cmds=["/bin/bash"], arguments=["-c", (f"echo 'getting AI counts!' ; rm -r ai || true ; " f"mkdir -p ai && " - f"python3 get_ai_counts.py ai/ai_company_papers.jsonl ai/ai_company_patents.jsonl" + f"python3 get_ai_counts.py ai/ai_company_papers.jsonl ai/ai_company_patents.jsonl && " f"gsutil -m cp -r ai gs://{DATA_BUCKET}/{tmp_dir}/ ")], namespace="default", image=f"us.gcr.io/{PROJECT_ID}/parat", @@ -218,7 +218,7 @@ cmds=["/bin/bash"], arguments=["-c", (f"echo 'getting {paper_type} paper counts!' 
; rm -r {paper_type} || true ; " f"mkdir -p {paper_type} && " - f"python3 {paper_type}_papers.py {paper_type}/{paper_type}_paper_counts.jsonl" + f"python3 {paper_type}_papers.py {paper_type}/{paper_type}_paper_counts.jsonl && " f"gsutil -m cp -r {paper_type} gs://{DATA_BUCKET}/{tmp_dir}/ ")], namespace="default", image=f"us.gcr.io/{PROJECT_ID}/parat", diff --git a/company_linkage/sequences/visualization_data.csv b/company_linkage/sequences/visualization_data.csv index 49ce1dd8..f6a24560 100644 --- a/company_linkage/sequences/visualization_data.csv +++ b/company_linkage/sequences/visualization_data.csv @@ -1 +1,19 @@ staging_ai_companies_visualization,initial_visualization_data +staging_ai_companies_visualization,visualization_data_with_by_year +staging_ai_companies_visualization,visualization_data_with_top_papers +staging_ai_companies_visualization,visualization_data_with_all_papers +staging_ai_companies_visualization,visualization_data_omit_by_rule +staging_ai_companies_visualization,visualization_data +staging_ai_companies_visualization,initial_patent_visualization_data +staging_ai_companies_visualization,patent_visualization_data_with_by_year +staging_ai_companies_visualization,patent_visualization_data +staging_ai_companies_visualization,initial_paper_visualization_data +staging_ai_companies_visualization,paper_visualization_data_with_mag +staging_ai_companies_visualization,paper_visualization_data_with_clusters +staging_ai_companies_visualization,paper_visualization_data_with_company_references +staging_ai_companies_visualization,paper_visualization_data_with_tasks +staging_ai_companies_visualization,paper_visualization_data_with_methods +staging_ai_companies_visualization,paper_visualization_data +staging_ai_companies_visualization,initial_workforce_visualization_data +staging_ai_companies_visualization,workforce_visualization_data_with_ai_jobs +staging_ai_companies_visualization,workforce_visualization_data \ No newline at end of file diff --git 
a/company_linkage/sql/adding_paper_patent_data.sql b/company_linkage/sql/adding_paper_patent_data.sql deleted file mode 100644 index b075c7c7..00000000 --- a/company_linkage/sql/adding_paper_patent_data.sql +++ /dev/null @@ -1,34 +0,0 @@ --- DEPRECATED, REMOVE SOON --- Update the visualization table itself to add paper and patent data -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS - -- Pull in the paper and patent counts, along with the CSET ids to link them in -WITH - count_data AS ( - SELECT - CSET_id, - ai_pubs, - ai_pubs_by_year, - ai_patents, - ai_patents_by_year - FROM - `gcp-cset-projects.ai_companies_visualization.paper_patent_counts`), - -- Pull in the current visualization data. Exclude the ai_pubs data, since that was included when we built the paper/patent data, so we don't need it - viz_data AS ( - SELECT - * EXCEPT(ai_pubs, ai_pubs_by_year) - FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data`) - -- Join the two together using the CSET id -SELECT - viz_data.*, - ai_pubs, - ai_pubs_by_year, - ai_patents, - ai_patents_by_year -FROM - viz_data -LEFT JOIN - count_data -ON - viz_data.CSET_id = count_data.CSET_id \ No newline at end of file diff --git a/company_linkage/sql/creating_paper_visualization_data.sql b/company_linkage/sql/initial_paper_visualization_data.sql similarity index 90% rename from company_linkage/sql/creating_paper_visualization_data.sql rename to company_linkage/sql/initial_paper_visualization_data.sql index 5aacbb79..ba20b588 100644 --- a/company_linkage/sql/creating_paper_visualization_data.sql +++ b/company_linkage/sql/initial_paper_visualization_data.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.paper_visualization_data AS WITH get_citations AS ( SELECT diff --git a/company_linkage/sql/creating_patent_visualization_data.sql b/company_linkage/sql/initial_patent_visualization_data.sql similarity index 98% rename from 
company_linkage/sql/creating_patent_visualization_data.sql rename to company_linkage/sql/initial_patent_visualization_data.sql index 134c75b8..7720057f 100644 --- a/company_linkage/sql/creating_patent_visualization_data.sql +++ b/company_linkage/sql/initial_patent_visualization_data.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.patent_visualization_data AS WITH aipats AS ( -- Pulling all the patents from any of our companies diff --git a/company_linkage/sql/creating_initial_visualization_data_publications.sql b/company_linkage/sql/initial_visualization_data.sql similarity index 100% rename from company_linkage/sql/creating_initial_visualization_data_publications.sql rename to company_linkage/sql/initial_visualization_data.sql diff --git a/company_linkage/sql/creating_workforce_visualization_data.sql b/company_linkage/sql/initial_workforce_visualization_data.sql similarity index 100% rename from company_linkage/sql/creating_workforce_visualization_data.sql rename to company_linkage/sql/initial_workforce_visualization_data.sql diff --git a/company_linkage/sql/merged_ai_papers.sql b/company_linkage/sql/merged_ai_papers.sql deleted file mode 100644 index 9f3478f2..00000000 --- a/company_linkage/sql/merged_ai_papers.sql +++ /dev/null @@ -1,13 +0,0 @@ -CREATE OR REPLACE TABLE - `gcp-cset-projects.ai_companies_visualization.ai_company_pubs` AS -SELECT - DISTINCT * -FROM - `gcp-cset-projects.ai_companies_visualization.ai_company_pubs` -UNION DISTINCT -SELECT - DISTINCT * -FROM - `gcp-cset-projects.ai_companies_visualization.ai_company_pubs_no_grid` -ORDER BY - id \ No newline at end of file diff --git a/company_linkage/sql/omitting_companies.sql b/company_linkage/sql/omitting_companies.sql deleted file mode 100644 index 1d5d3cef..00000000 --- a/company_linkage/sql/omitting_companies.sql +++ /dev/null @@ -1,15 +0,0 @@ --- DEPRECATED, REMOVE WHEN READY --- We want to omit companies from the visualization -CREATE OR REPLACE TABLE - 
ai_companies_visualization.visualization_data AS -SELECT - * -FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data` - -- Omitting companies based on list -WHERE - CSET_id NOT IN ( - SELECT - * - FROM - ai_companies_visualization.omit) \ No newline at end of file diff --git a/company_linkage/sql/omit_by_rule_papers.sql b/company_linkage/sql/paper_visualization_data.sql similarity index 100% rename from company_linkage/sql/omit_by_rule_papers.sql rename to company_linkage/sql/paper_visualization_data.sql diff --git a/company_linkage/sql/adding_top_science_map_clusters.sql b/company_linkage/sql/paper_visualization_data_with_clusters.sql similarity index 100% rename from company_linkage/sql/adding_top_science_map_clusters.sql rename to company_linkage/sql/paper_visualization_data_with_clusters.sql diff --git a/company_linkage/sql/adding_company_references.sql b/company_linkage/sql/paper_visualization_data_with_company_references.sql similarity index 100% rename from company_linkage/sql/adding_company_references.sql rename to company_linkage/sql/paper_visualization_data_with_company_references.sql diff --git a/company_linkage/sql/adding_top_mag_ai_fields.sql b/company_linkage/sql/paper_visualization_data_with_mag.sql similarity index 100% rename from company_linkage/sql/adding_top_mag_ai_fields.sql rename to company_linkage/sql/paper_visualization_data_with_mag.sql diff --git a/company_linkage/sql/adding_top_methods.sql b/company_linkage/sql/paper_visualization_data_with_methods.sql similarity index 100% rename from company_linkage/sql/adding_top_methods.sql rename to company_linkage/sql/paper_visualization_data_with_methods.sql diff --git a/company_linkage/sql/adding_top_tasks.sql b/company_linkage/sql/paper_visualization_data_with_tasks.sql similarity index 100% rename from company_linkage/sql/adding_top_tasks.sql rename to company_linkage/sql/paper_visualization_data_with_tasks.sql diff --git a/company_linkage/sql/omit_by_rule_patents.sql 
b/company_linkage/sql/patent_visualization_data.sql similarity index 100% rename from company_linkage/sql/omit_by_rule_patents.sql rename to company_linkage/sql/patent_visualization_data.sql diff --git a/company_linkage/sql/adding_ai_patents_by_year_to_visualization.sql b/company_linkage/sql/patent_visualization_data_with_by_year.sql similarity index 100% rename from company_linkage/sql/adding_ai_patents_by_year_to_visualization.sql rename to company_linkage/sql/patent_visualization_data_with_by_year.sql diff --git a/company_linkage/sql/adding_crunchbase_company_metadata.sql b/company_linkage/sql/visualization_data.sql similarity index 100% rename from company_linkage/sql/adding_crunchbase_company_metadata.sql rename to company_linkage/sql/visualization_data.sql diff --git a/company_linkage/sql/omit_by_rule.sql b/company_linkage/sql/visualization_data_omit_by_rule.sql similarity index 100% rename from company_linkage/sql/omit_by_rule.sql rename to company_linkage/sql/visualization_data_omit_by_rule.sql diff --git a/company_linkage/sql/adding_all_paper_counts.sql b/company_linkage/sql/visualization_data_with_all_papers.sql similarity index 100% rename from company_linkage/sql/adding_all_paper_counts.sql rename to company_linkage/sql/visualization_data_with_all_papers.sql diff --git a/company_linkage/sql/adding_ai_pubs_by_year_to_visualization.sql b/company_linkage/sql/visualization_data_with_by_year.sql similarity index 100% rename from company_linkage/sql/adding_ai_pubs_by_year_to_visualization.sql rename to company_linkage/sql/visualization_data_with_by_year.sql diff --git a/company_linkage/sql/adding_top_paper_counts.sql b/company_linkage/sql/visualization_data_with_top_papers.sql similarity index 100% rename from company_linkage/sql/adding_top_paper_counts.sql rename to company_linkage/sql/visualization_data_with_top_papers.sql diff --git a/company_linkage/sql/omit_by_rule_workforce.sql b/company_linkage/sql/workforce_visualization_data.sql similarity index 100% 
rename from company_linkage/sql/omit_by_rule_workforce.sql rename to company_linkage/sql/workforce_visualization_data.sql diff --git a/company_linkage/sql/adding_ai_jobs_to_workforce_visualization.sql b/company_linkage/sql/workforce_visualization_data_with_ai_jobs.sql similarity index 100% rename from company_linkage/sql/adding_ai_jobs_to_workforce_visualization.sql rename to company_linkage/sql/workforce_visualization_data_with_ai_jobs.sql From 4694433532e94d71beddee7ec75e25917e10c877 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Tue, 10 Oct 2023 16:55:50 -0400 Subject: [PATCH 05/17] Switch AI count scripts to ROR; update workforce data --- company_linkage/parat_scripts/all_papers.py | 2 +- .../parat_scripts/get_ai_counts.py | 50 ++++++++++--------- .../parat_scripts/test_ai_counts.py | 12 ++--- company_linkage/parat_scripts/top_papers.py | 2 +- .../initial_workforce_visualization_data.sql | 16 +++--- 5 files changed, 42 insertions(+), 40 deletions(-) diff --git a/company_linkage/parat_scripts/all_papers.py b/company_linkage/parat_scripts/all_papers.py index 9fe5d534..f9234069 100644 --- a/company_linkage/parat_scripts/all_papers.py +++ b/company_linkage/parat_scripts/all_papers.py @@ -18,7 +18,7 @@ def main() -> None: paper_finder.get_identifiers() # These are the only two lines that make this different from running AI pubs # We select from a different table - table_name = "ai_companies_visualization.all_publications" + table_name = "staging_ai_companies_visualization.all_publications" # And we write out our data to a different variable companies = paper_finder.run_query_papers(table_name, "all_pubs", by_year=True) paper_finder.write_output(companies, args.output_file) diff --git a/company_linkage/parat_scripts/get_ai_counts.py b/company_linkage/parat_scripts/get_ai_counts.py index 504099de..681f704d 100644 --- a/company_linkage/parat_scripts/get_ai_counts.py +++ b/company_linkage/parat_scripts/get_ai_counts.py @@ -14,7 +14,7 @@ def __init__(self) -> None: AI 
papers in top conferences, etc.) and AI patents (from Dimensions and 1790 jointly). """ self.regex_dict = defaultdict(list) - self.grid_dict = defaultdict(list) + self.ror_dict = defaultdict(list) self.cset_ids = [] self.company_ids = [] self.patent_fields = ["Physical_Sciences_and_Engineering", @@ -58,7 +58,7 @@ def get_identifiers(self) -> None: Pulling the regular expressions used to find papers and patents through means other than GRID. :return: """ - regex_query = """SELECT CSET_id, regex, grid FROM + regex_query = """SELECT CSET_id, regex, ror_id FROM `gcp-cset-projects.high_resolution_entities.aggregated_organizations`""" client = bigquery.Client() query_job = client.query(regex_query) @@ -67,15 +67,15 @@ def get_identifiers(self) -> None: if result.regex: for regex in result.regex: self.regex_dict[result.CSET_id].append(regex) - if result.grid: - for grid_id in result.grid: - self.grid_dict[result.CSET_id].append(grid_id) + if result.ror_id: + for ror in result.ror_id: + self.ror_dict[result.CSET_id].append(ror) self.cset_ids.append(result.CSET_id) def run_query_papers(self, table_name: str, field_name: str, test: bool = False, by_year: bool = False) -> list: """ - Running a query to find paper counts using regex for papers missing GRID. This query combines - this data with preexisting paper counts already identified using SQL for papers that have GRID. + Running a query to find paper counts using regex for papers missing ROR. This query combines + this data with preexisting paper counts already identified using SQL for papers that have ROR. We no longer use this query for AI papers, but it is still used for top conference papers and total papers. 
:param table_name: The table to look for papers in @@ -100,8 +100,8 @@ def run_query_papers(self, table_name: str, field_name: str, test: bool = False, if len(regexes) > 1: for regex in regexes[1:]: query += f"""OR regexp_contains(org_name, r'(?i){regex}') """ - if cset_id in self.grid_dict: - query += f"""OR grid_id IN ({str(self.grid_dict[cset_id])[1:-1]})""" + if cset_id in self.ror_dict: + query += f"""OR ror_id IN ({str(self.ror_dict[cset_id])[1:-1]})""" query_job = client.query(query) # query_job is an iterator, so even though we're only returning one row we're going to loop for element in query_job: @@ -109,7 +109,7 @@ def run_query_papers(self, table_name: str, field_name: str, test: bool = False, # if we don't have total data, we won't have by_year either if by_year: row_dict[field_name_by_year] = self.run_query_papers_by_year(table_name, field_name, regexes, - self.grid_dict[cset_id]) + self.ror_dict[cset_id]) if not row_dict[field_name]: # if we end up without any papers, set that to be true row_dict[field_name] = 0 @@ -119,7 +119,7 @@ def run_query_papers(self, table_name: str, field_name: str, test: bool = False, companies.append(row_dict) return companies - def run_query_papers_by_year(self, table_name: str, field_name: str, regexes: list, grids: list) -> list: + def run_query_papers_by_year(self, table_name: str, field_name: str, regexes: list, rors: list) -> list: """ Getting the same paper count data, except split by year. 
We no longer use this query for AI papers, but it is still used for top conference papers and @@ -127,7 +127,7 @@ def run_query_papers_by_year(self, table_name: str, field_name: str, regexes: li :param table_name: The table to look for papers in :param field_name: The json field name :param regexes: The regexes for whichever CSET_id we're searching for - :param grids: The grids for whichever CSET_id we're searching for if they exist; otherwise an empty list + :param rors: The rors for whichever CSET_id we're searching for if they exist; otherwise an empty list :return: """ field_name_by_year = f"{field_name}_by_year" @@ -143,8 +143,8 @@ def run_query_papers_by_year(self, table_name: str, field_name: str, regexes: li for regex in regexes[1:]: # regex_to_use = rf"r'(?i){regex}'" query += f"""OR regexp_contains(org_name, r'(?i){regex}') """ - if grids: - query += f"""OR grid_id IN ({str(grids)[1:-1]}) """ + if rors: + query += f"""OR ror_id IN ({str(rors)[1:-1]}) """ query += """GROUP BY year ORDER BY year""" client = bigquery.Client() query_job = client.query(query) @@ -160,7 +160,7 @@ def run_query_id_papers(self, table_name: str, test: bool = False) -> list: :param test: False if not running as a unit test :return: """ - companies_query = f"""SELECT CSET_id, grid FROM + companies_query = f"""SELECT CSET_id, ror_id FROM `gcp-cset-projects.high_resolution_entities.aggregated_organizations`""" if test: companies_query += """ LIMIT 25""" @@ -180,9 +180,9 @@ def run_query_id_papers(self, table_name: str, test: bool = False) -> list: if len(regexes) > 1: for regex in regexes[1:]: query += f"""OR regexp_contains(org_name, r'(?i){regex}') """ - if row["grid"]: - self.grid_dict[row["CSET_id"]] = row["grid"] - query += f"""OR grid_id IN ({str(row["grid"])[1:-1]})""" + if row["ror_id"]: + self.ror_dict[row["CSET_id"]] = row["ror_id"] + query += f"""OR ror_id IN ({str(row["ror_id"])[1:-1]})""" query_job = client.query(query) # get all the merged ids for element in query_job: 
@@ -192,11 +192,15 @@ def run_query_id_papers(self, table_name: str, test: bool = False) -> list: return company_rows def run_query_id_patents(self): + """ + Get patent counts one by one using CSET_ids. + :return: + """ patent_companies = [] for cset_id in self.company_ids: if cset_id in self.regex_dict: regexes = self.regex_dict[cset_id] - grids = self.grid_dict[cset_id] + rors = self.ror_dict[cset_id] query = f"""SELECT DISTINCT family_id, priority_year, @@ -236,14 +240,14 @@ def run_query_id_patents(self): Machine_Learning, Search_Methods FROM - ai_companies_visualization.linked_ai_patents + staging_ai_companies_visualization.linked_ai_patents WHERE regexp_contains(assignee, r'(?i){regexes[0]}') """ # if we have more than one regex for an org, include all of them if len(regexes) > 1: for regex in regexes[1:]: query += f"""OR regexp_contains(assignee, r'(?i){regex}') """ - if grids: - query += f"""OR grid IN ({str(grids)[1:-1]})""" + if rors: + query += f"""OR ror_id IN ({str(rors)[1:-1]})""" client = bigquery.Client() query_job = client.query(query) for row in query_job: @@ -278,7 +282,7 @@ def main() -> None: count_getter = CountGetter() print("Fetching identifiers") count_getter.get_identifiers() - table_name = "gcp-cset-projects.ai_companies_visualization.ai_publications" + table_name = "gcp-cset-projects.staging_ai_companies_visualization.ai_publications" print("Fetching paper data") company_rows = count_getter.run_query_id_papers(table_name) print("Writing results") diff --git a/company_linkage/parat_scripts/test_ai_counts.py b/company_linkage/parat_scripts/test_ai_counts.py index 3845ffff..926a2731 100644 --- a/company_linkage/parat_scripts/test_ai_counts.py +++ b/company_linkage/parat_scripts/test_ai_counts.py @@ -22,7 +22,7 @@ def test_get_identifiers(self): count_getter.get_identifiers() # the dicts are populated self.assertGreater(len(count_getter.regex_dict), 0) - self.assertGreater(len(count_getter.grid_dict), 0) + 
self.assertGreater(len(count_getter.ror_dict), 0) self.assertGreater(len(count_getter.cset_ids), 0) self.assertEqual(type(count_getter.cset_ids), list) # the values in the dict are the correct type @@ -30,16 +30,16 @@ def test_get_identifiers(self): self.assertEqual(type(key_val), int) # we allow multiple regexes, so we have a list self.assertEqual(type(count_getter.regex_dict[key_val]), list) - for key_val in count_getter.grid_dict.keys(): + for key_val in count_getter.ror_dict.keys(): self.assertEqual(type(key_val), int) # we allow multiple regexes, so we have a list - self.assertEqual(type(count_getter.grid_dict[key_val]), list) + self.assertEqual(type(count_getter.ror_dict[key_val]), list) @ignore_warnings def test_run_query_papers(self): count_getter = CountGetter() count_getter.get_identifiers() - table_name = "gcp-cset-projects.ai_companies_visualization.ai_publications" + table_name = "gcp-cset-projects.staging_ai_companies_visualization.ai_publications" test = True companies = count_getter.run_query_papers(table_name, "ai_pubs", test=test, by_year=False) # Make sure we're setting the AI pubs for every company! 
@@ -67,7 +67,7 @@ def test_run_query_papers(self): def test_run_query_id_papers(self): count_getter = CountGetter() count_getter.get_identifiers() - table_name = "gcp-cset-projects.ai_companies_visualization.ai_publications" + table_name = "gcp-cset-projects.staging_ai_companies_visualization.ai_publications" test = True company_rows = count_getter.run_query_id_papers(table_name, test=test) for company_row in company_rows: @@ -84,7 +84,7 @@ def test_run_query_id_papers(self): def test_run_query_id_patents(self): count_getter = CountGetter() count_getter.get_identifiers() - table_name = "gcp-cset-projects.ai_companies_visualization.ai_publications" + table_name = "gcp-cset-projects.staging_ai_companies_visualization.ai_publications" test = True count_getter.run_query_id_papers(table_name, test) patent_companies = count_getter.run_query_id_patents() diff --git a/company_linkage/parat_scripts/top_papers.py b/company_linkage/parat_scripts/top_papers.py index a11da6af..bc1bbece 100644 --- a/company_linkage/parat_scripts/top_papers.py +++ b/company_linkage/parat_scripts/top_papers.py @@ -18,7 +18,7 @@ def main() -> None: paper_finder.get_identifiers() # These are the only two lines that make this different from running AI pubs # We select from a different table - table_name = "ai_companies_visualization.pubs_in_top_conferences" + table_name = "staging_ai_companies_visualization.pubs_in_top_conferences" # And we write out our data to a different variable companies = paper_finder.run_query_papers(table_name, "ai_pubs_in_top_conferences", by_year=True) paper_finder.write_output(companies, args.output_file) diff --git a/company_linkage/sql/initial_workforce_visualization_data.sql b/company_linkage/sql/initial_workforce_visualization_data.sql index 70620d57..1c0f9be9 100644 --- a/company_linkage/sql/initial_workforce_visualization_data.sql +++ b/company_linkage/sql/initial_workforce_visualization_data.sql @@ -1,11 +1,9 @@ -CREATE OR REPLACE TABLE - 
ai_companies_visualization.workforce_visualization_data AS WITH clean_linkedins AS ( SELECT DISTINCT cset_id, name, - REPLACE(linkedins, "https://www.", "http://") AS linkedin + REPLACE(REPLACE(linkedins, "https://www.", ""), "http://www.", "") AS linkedin FROM high_resolution_entities.aggregated_organizations CROSS JOIN @@ -16,11 +14,11 @@ SELECT FROM clean_linkedins LEFT JOIN - `gcp-cset-projects.gcp_cset_revelio.position` position + revelio.individual_position ON - linkedin = company_li_url + linkedin = company_linkedin_url INNER JOIN - gcp_cset_revelio.role_lookup + revelio.role_lookup USING (mapped_role) INNER JOIN @@ -28,12 +26,12 @@ INNER JOIN ON (k1000 = role_k1000) LEFT JOIN - gcp_cset_revelio.education + revelio.individual_education USING (user_id) WHERE - (position.enddate IS NULL - OR position.enddate > CURRENT_DATE ()) + (individual_position.enddate IS NULL + OR individual_position.enddate > CURRENT_DATE ()) AND (ba_req IS FALSE OR ((degree = "Bachelor" OR degree = "Master" From 1f88998c0b038d36eaa535c9f12580fcf1e3a5ad Mon Sep 17 00:00:00 2001 From: Rebecca Date: Thu, 12 Oct 2023 13:39:32 -0400 Subject: [PATCH 06/17] Add visualization data to DAG; update initial tables --- company_linkage/parat_data_dag.py | 41 +++++++++++++++---- .../sequences/visualization_data.csv | 8 ++-- .../sql/initial_visualization_data.sql | 16 +++----- .../visualization_data_with_all_papers.sql | 11 ++--- .../sql/visualization_data_with_by_year.sql | 16 ++++---- .../visualization_data_with_top_papers.sql | 8 ++-- 6 files changed, 56 insertions(+), 44 deletions(-) diff --git a/company_linkage/parat_data_dag.py b/company_linkage/parat_data_dag.py index 029a0ec3..0bde5f58 100644 --- a/company_linkage/parat_data_dag.py +++ b/company_linkage/parat_data_dag.py @@ -146,11 +146,6 @@ write_disposition="WRITE_TRUNCATE" ) - # TODO: somewhere in here we need to decide whether to load directly to the main table - # or to add a transfer step to transfer from staging to the main table; if 
the latter - # are there checks we want to add first? - # for now, pretend the data is in the main table already - run_get_ai_counts = GKEStartPodOperator( task_id="run_get_ai_counts", project_id=PROJECT_ID, @@ -187,7 +182,7 @@ load_ai_papers = GCSToBigQueryOperator( task_id=f"load_ai_company_papers", bucket=DATA_BUCKET, - source_objects=[f"{tmp_dir}/ai_company_papers.jsonl"], + source_objects=[f"{tmp_dir}/ai/ai_company_papers.jsonl"], schema_object=f"{schema_dir}/ai_papers_schema.json", destination_project_dataset_table=f"{staging_dataset}.ai_company_papers", source_format="NEWLINE_DELIMITED_JSON", @@ -198,7 +193,7 @@ load_ai_patents = GCSToBigQueryOperator( task_id=f"load_ai_company_patents", bucket=DATA_BUCKET, - source_objects=[f"{tmp_dir}/ai_company_patents.jsonl"], + source_objects=[f"{tmp_dir}/ai/ai_company_patents.jsonl"], schema_object=f"{schema_dir}/ai_patents_schema.json", destination_project_dataset_table=f"{staging_dataset}.ai_company_patents", source_format="NEWLINE_DELIMITED_JSON", @@ -249,7 +244,7 @@ load_top_papers = GCSToBigQueryOperator( task_id=f"load_top_papers", bucket=DATA_BUCKET, - source_objects=[f"{tmp_dir}/top_paper_counts.jsonl"], + source_objects=[f"{tmp_dir}/top/top_paper_counts.jsonl"], schema_object=f"{schema_dir}/top_papers_schema.json", destination_project_dataset_table=f"{staging_dataset}.top_paper_counts", source_format="NEWLINE_DELIMITED_JSON", @@ -260,7 +255,7 @@ load_all_papers = GCSToBigQueryOperator( task_id=f"load_all_papers", bucket=DATA_BUCKET, - source_objects=[f"{tmp_dir}/all_paper_counts.jsonl"], + source_objects=[f"{tmp_dir}/all/all_paper_counts.jsonl"], schema_object=f"{schema_dir}/all_papers_schema.json", destination_project_dataset_table=f"{staging_dataset}.all_paper_counts", source_format="NEWLINE_DELIMITED_JSON", @@ -268,8 +263,35 @@ write_disposition="WRITE_TRUNCATE" ) + start_visualization_tables = DummyOperator(task_id="start_visualization_tables") + wait_for_visualization_tables = 
DummyOperator(task_id="wait_for_visualization_tables") + visualization_query_sequence = "visualization_data.csv" + curr = start_visualization_tables + for line in open(seq_path_prefix + visualization_query_sequence).readlines(): + dataset, table = line.split(",") + table_name = f"{dataset}.{table.strip()}" + next_tab = BigQueryInsertJobOperator( + task_id=f"create_{table_name}", + configuration={ + "query": { + "query": "{% include '" + f"{sql_dir}/{table.strip()}.sql" + "' %}", + "useLegacySql": False, + "destinationTable": { + "projectId": PROJECT_ID, + "datasetId": dataset, + "tableId": table + }, + "allowLargeResults": True, + "createDisposition": "CREATE_IF_NEEDED", + "writeDisposition": "WRITE_TRUNCATE" + } + }, + ) + curr >> next_tab + curr = next_tab + curr >> wait_for_visualization_tables ( @@ -288,5 +310,6 @@ >> run_papers >> load_top_papers >> load_all_papers + >> start_visualization_tables ) diff --git a/company_linkage/sequences/visualization_data.csv b/company_linkage/sequences/visualization_data.csv index f6a24560..09ae1630 100644 --- a/company_linkage/sequences/visualization_data.csv +++ b/company_linkage/sequences/visualization_data.csv @@ -2,18 +2,18 @@ staging_ai_companies_visualization,initial_visualization_data staging_ai_companies_visualization,visualization_data_with_by_year staging_ai_companies_visualization,visualization_data_with_top_papers staging_ai_companies_visualization,visualization_data_with_all_papers -staging_ai_companies_visualization,visualization_data_omit_by_rule -staging_ai_companies_visualization,visualization_data staging_ai_companies_visualization,initial_patent_visualization_data staging_ai_companies_visualization,patent_visualization_data_with_by_year -staging_ai_companies_visualization,patent_visualization_data staging_ai_companies_visualization,initial_paper_visualization_data staging_ai_companies_visualization,paper_visualization_data_with_mag staging_ai_companies_visualization,paper_visualization_data_with_clusters 
staging_ai_companies_visualization,paper_visualization_data_with_company_references staging_ai_companies_visualization,paper_visualization_data_with_tasks staging_ai_companies_visualization,paper_visualization_data_with_methods -staging_ai_companies_visualization,paper_visualization_data staging_ai_companies_visualization,initial_workforce_visualization_data staging_ai_companies_visualization,workforce_visualization_data_with_ai_jobs +staging_ai_companies_visualization,visualization_data_omit_by_rule +staging_ai_companies_visualization,visualization_data +staging_ai_companies_visualization,patent_visualization_data +staging_ai_companies_visualization,paper_visualization_data staging_ai_companies_visualization,workforce_visualization_data \ No newline at end of file diff --git a/company_linkage/sql/initial_visualization_data.sql b/company_linkage/sql/initial_visualization_data.sql index 64097b5d..c7eb2642 100644 --- a/company_linkage/sql/initial_visualization_data.sql +++ b/company_linkage/sql/initial_visualization_data.sql @@ -1,12 +1,8 @@ -- This query pulls the initial visualization data for the table that doesn't have to be compiled (as it's already -- available in the organizations table) and adds in the AI publication counts. 
- - -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS WITH aipubs AS ( - -- Pulling all the papers with any of the given GRIDs as affiliates + -- Pulling all the papers with any of the given RORs as affiliates SELECT CSET_id, merged_id, @@ -14,8 +10,8 @@ WITH nlp, robotics FROM - ai_companies_visualization.ai_company_pubs), - gridtable AS ( + staging_ai_companies_visualization.ai_company_papers), + rortable AS ( -- Getting the count of publications SELECT CSET_id, @@ -41,7 +37,7 @@ SELECT market, crunchbase, child_crunchbase, - grid, + ror_id, linkedin, in_sandp_500, in_fortune_global_500, @@ -50,8 +46,8 @@ SELECT COALESCE(nlp_pubs, 0) as nlp_pubs, COALESCE(robotics_pubs, 0) as robotics_pubs FROM - `gcp-cset-projects.high_resolution_entities.aggregated_organizations` AS orgs + high_resolution_entities.aggregated_organizations LEFT JOIN - gridtable + rortable USING (CSET_id) \ No newline at end of file diff --git a/company_linkage/sql/visualization_data_with_all_papers.sql b/company_linkage/sql/visualization_data_with_all_papers.sql index 073a6dce..67901809 100644 --- a/company_linkage/sql/visualization_data_with_all_papers.sql +++ b/company_linkage/sql/visualization_data_with_all_papers.sql @@ -1,6 +1,4 @@ -- Update the visualization table itself to add total paper data -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS -- Pull in the total paper counts, along with the CSET ids to link them in WITH count_data AS ( @@ -9,14 +7,13 @@ WITH all_pubs, all_pubs_by_year, FROM - `gcp-cset-projects.ai_companies_visualization.total_paper_counts`), - -- Pull in the current visualization data. 
Exclude the all_paper data, since that was included when we built the all paper data, so we don't need it + staging_ai_companies_visualization.all_paper_counts), + -- Pull in the current visualization data viz_data AS ( SELECT - * EXCEPT(all_pubs, - all_pubs_by_year) + * FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data`) + staging_ai_companies_visualization.visualization_data_with_top_papers) -- Join the two together using the CSET id SELECT viz_data.*, diff --git a/company_linkage/sql/visualization_data_with_by_year.sql b/company_linkage/sql/visualization_data_with_by_year.sql index 51db60ec..d229b93f 100644 --- a/company_linkage/sql/visualization_data_with_by_year.sql +++ b/company_linkage/sql/visualization_data_with_by_year.sql @@ -1,7 +1,5 @@ -- Adding AI publication data by year to the visualization table -- This uses the same mechanism as adding AI publication counts; we're just doing it on a by-year basis -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS WITH aipubs AS ( -- Pulling all the papers with any of the given GRIDs as affiliates @@ -13,8 +11,8 @@ WITH nlp, robotics FROM - ai_companies_visualization.ai_company_pubs), - gridtable AS ( + staging_ai_companies_visualization.ai_company_papers), + rortable AS ( -- Getting the count of publications SELECT CSET_id, @@ -49,23 +47,23 @@ WITH ORDER BY year) AS robotics_pubs_by_year, FROM - `gcp-cset-projects.high_resolution_entities.aggregated_organizations` AS orgs + high_resolution_entities.aggregated_organizations LEFT JOIN - gridtable + rortable USING (CSET_id) GROUP BY CSET_id) SELECT - viz.*, + initial_visualization_data.*, ai_pubs_by_year, cv_pubs_by_year, nlp_pubs_by_year, robotics_pubs_by_year FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data` AS viz + staging_ai_companies_visualization.initial_visualization_data LEFT JOIN by_year ON - viz.CSET_id = by_year.CSET_id + initial_visualization_data.CSET_id = by_year.CSET_id ORDER BY 
cset_id \ No newline at end of file diff --git a/company_linkage/sql/visualization_data_with_top_papers.sql b/company_linkage/sql/visualization_data_with_top_papers.sql index cf5e0d47..7277bd51 100644 --- a/company_linkage/sql/visualization_data_with_top_papers.sql +++ b/company_linkage/sql/visualization_data_with_top_papers.sql @@ -1,6 +1,4 @@ -- Update the visualization table itself to add top paper data -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS -- Pull in the top paper counts, along with the CSET ids to link them in WITH count_data AS ( @@ -9,13 +7,13 @@ WITH ai_pubs_in_top_conferences, ai_pubs_in_top_conferences_by_year, FROM - `gcp-cset-projects.ai_companies_visualization.top_paper_counts`), - -- Pull in the current visualization data. Exclude the ai_pubs_in_top_conferences data, since that was included when we built the top paper data, so we don't need it + staging_ai_companies_visualization.top_paper_counts), + -- Pull in the current visualization data. 
viz_data AS ( SELECT * FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data`) + staging_ai_companies_visualization.visualization_data_with_by_year) -- Join the two together using the CSET id SELECT viz_data.*, From d06ff09e12cece66eb044ebf7de245e8d9650416 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Fri, 13 Oct 2023 11:13:06 -0400 Subject: [PATCH 07/17] Update paper visualization other than tasks+methods --- .../sql/initial_paper_visualization_data.sql | 10 +++++----- .../paper_visualization_data_with_clusters.sql | 10 ++++------ ...isualization_data_with_company_references.sql | 16 +++++++--------- .../sql/paper_visualization_data_with_mag.sql | 14 ++++++-------- 4 files changed, 22 insertions(+), 28 deletions(-) diff --git a/company_linkage/sql/initial_paper_visualization_data.sql b/company_linkage/sql/initial_paper_visualization_data.sql index ba20b588..57d83813 100644 --- a/company_linkage/sql/initial_paper_visualization_data.sql +++ b/company_linkage/sql/initial_paper_visualization_data.sql @@ -2,14 +2,14 @@ WITH get_citations AS ( SELECT DISTINCT CSET_id, - refs_merged.merged_id, + references.merged_id, ref_id FROM - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers LEFT JOIN - `gcp-cset-projects.gcp_cset_links_v2.paper_references_merged` refs_merged + literature.references ON - (ai_company_pubs.merged_id = ref_id)), + (ai_company_papers.merged_id = ref_id)), add_year AS ( SELECT DISTINCT CSET_id, @@ -19,7 +19,7 @@ WITH FROM get_citations LEFT JOIN - gcp_cset_links_v2.corpus_merged + literature.papers USING (merged_id) WHERE diff --git a/company_linkage/sql/paper_visualization_data_with_clusters.sql b/company_linkage/sql/paper_visualization_data_with_clusters.sql index 320490ea..637843b2 100644 --- a/company_linkage/sql/paper_visualization_data_with_clusters.sql +++ b/company_linkage/sql/paper_visualization_data_with_clusters.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE TABLE - 
ai_companies_visualization.paper_visualization_data AS WITH company_cluster_assignment AS ( SELECT @@ -7,9 +5,9 @@ WITH merged_id, cluster_id FROM - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers LEFT JOIN - `gcp-cset-projects.science_map_v2.dc5_cluster_assignment_stable` + map_of_science.cluster_assignment USING (merged_id) WHERE @@ -36,10 +34,10 @@ WITH GROUP BY CSET_id) SELECT - paper_visualization_data.*, + paper_visualization_data_with_mag.*, clusters FROM - ai_companies_visualization.paper_visualization_data + staging_ai_companies_visualization.paper_visualization_data_with_mag LEFT JOIN aggregated_clusters USING diff --git a/company_linkage/sql/paper_visualization_data_with_company_references.sql b/company_linkage/sql/paper_visualization_data_with_company_references.sql index 2b49b118..f935f350 100644 --- a/company_linkage/sql/paper_visualization_data_with_company_references.sql +++ b/company_linkage/sql/paper_visualization_data_with_company_references.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.paper_visualization_data AS -- First get all the articles cited by the AI papers written by our companies WITH get_references AS ( @@ -8,9 +6,9 @@ WITH merged_id, ref_id FROM - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers LEFT JOIN - `gcp-cset-projects.gcp_cset_links_v2.paper_references_merged` + literature.references USING (merged_id)), referenced_companies AS ( @@ -18,13 +16,13 @@ WITH DISTINCT get_references.CSET_id, get_references.merged_id, ref_id, - ai_company_pubs.CSET_id AS ref_CSET_id + ai_company_papers.CSET_id AS ref_CSET_id FROM get_references INNER JOIN - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers ON - ref_id = ai_company_pubs.merged_id + ref_id = ai_company_papers.merged_id ORDER BY CSET_id), count_company_refs AS ( @@ -54,10 +52,10 @@ GROUP BY ORDER BY CSET_id) 
SELECT - paper_visualization_data.*, + paper_visualization_data_with_clusters.*, company_references FROM - ai_companies_visualization.paper_visualization_data + staging_ai_companies_visualization.paper_visualization_data_with_clusters LEFT JOIN aggregated_refs USING diff --git a/company_linkage/sql/paper_visualization_data_with_mag.sql b/company_linkage/sql/paper_visualization_data_with_mag.sql index 7223d747..46e93f76 100644 --- a/company_linkage/sql/paper_visualization_data_with_mag.sql +++ b/company_linkage/sql/paper_visualization_data_with_mag.sql @@ -1,19 +1,17 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.paper_visualization_data AS WITH names AS ( SELECT field_id AS child_field_id, name FROM - `gcp-cset-projects.fields_of_study.field_meta`), + fields_of_study.field_meta), ai_subfields AS ( SELECT field_id, child_field_id, name AS child_name FROM - `gcp-cset-projects.fields_of_study.field_children` + fields_of_study.field_children LEFT JOIN names USING @@ -43,7 +41,7 @@ WITH field.id AS field_id, field.name AS field_name FROM - `gcp-cset-projects.fields_of_study.top_fields` + fields_of_study.top_fields CROSS JOIN UNNEST(fields) AS field INNER JOIN @@ -59,7 +57,7 @@ WITH field_id, field_name FROM - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers LEFT JOIN articles_with_ai_subfields USING @@ -88,10 +86,10 @@ WITH GROUP BY CSET_id) SELECT - paper_visualization_data.*, + initial_paper_visualization_data.*, fields FROM - ai_companies_visualization.paper_visualization_data + staging_ai_companies_visualization.initial_paper_visualization_data LEFT JOIN aggregated_fields USING From 61d0e4680db9fc3007c5d40ddf9a7fe12d69cb20 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Fri, 13 Oct 2023 11:21:29 -0400 Subject: [PATCH 08/17] Update patent visualization data --- company_linkage/data/omit.csv | 40 ------------------- .../sql/initial_patent_visualization_data.sql | 4 +- ...patent_visualization_data_with_by_year.sql 
| 8 ++-- 3 files changed, 5 insertions(+), 47 deletions(-) delete mode 100644 company_linkage/data/omit.csv diff --git a/company_linkage/data/omit.csv b/company_linkage/data/omit.csv deleted file mode 100644 index 5e432fc3..00000000 --- a/company_linkage/data/omit.csv +++ /dev/null @@ -1,40 +0,0 @@ -CSET_id -100 -296 -346 -374 -380 -386 -412 -418 -464 -467 -495 -612 -628 -633 -649 -724 -728 -756 -767 -2287 -2774 -2778 -2784 -2789 -2806 -2815 -2831 -2850 -2851 -2855 -2875 -2922 -2956 -2976 -2977 -2981 -2987 -3036 -3058 \ No newline at end of file diff --git a/company_linkage/sql/initial_patent_visualization_data.sql b/company_linkage/sql/initial_patent_visualization_data.sql index 7720057f..f5ef5a68 100644 --- a/company_linkage/sql/initial_patent_visualization_data.sql +++ b/company_linkage/sql/initial_patent_visualization_data.sql @@ -4,7 +4,7 @@ WITH SELECT * FROM - ai_companies_visualization.ai_company_patents), + staging_ai_companies_visualization.ai_company_patents), pattable AS ( -- Getting the count of patents SELECT @@ -90,7 +90,7 @@ SELECT COALESCE(Machine_Learning_pats, 0) as Machine_Learning_pats, COALESCE(Search_Methods_pats, 0) as Search_Methods_pats, FROM - `gcp-cset-projects.high_resolution_entities.aggregated_organizations` AS orgs + high_resolution_entities.aggregated_organizations LEFT JOIN pattable USING diff --git a/company_linkage/sql/patent_visualization_data_with_by_year.sql b/company_linkage/sql/patent_visualization_data_with_by_year.sql index 5804c8f9..06ed1457 100644 --- a/company_linkage/sql/patent_visualization_data_with_by_year.sql +++ b/company_linkage/sql/patent_visualization_data_with_by_year.sql @@ -1,12 +1,10 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.patent_visualization_data AS WITH aipats AS ( -- Pulling all the patents from any of our companies SELECT * FROM - ai_companies_visualization.ai_company_patents), + staging_ai_companies_visualization.ai_company_patents), pattable AS ( -- Getting the count of patents 
SELECT @@ -202,7 +200,7 @@ WITH priority_year) AS Search_Methods_pats_by_year, FROM - `gcp-cset-projects.high_resolution_entities.aggregated_organizations` AS orgs + high_resolution_entities.aggregated_organizations LEFT JOIN pattable USING @@ -215,7 +213,7 @@ SELECT viz.*, by_year.* EXCEPT (CSET_id) FROM - `gcp-cset-projects.ai_companies_visualization.patent_visualization_data` AS viz + staging_ai_companies_visualization.initial_patent_visualization_data AS viz LEFT JOIN by_year USING From acebb36357963469e76460b70662ab6f5005ee6a Mon Sep 17 00:00:00 2001 From: Rebecca Date: Fri, 13 Oct 2023 11:30:56 -0400 Subject: [PATCH 09/17] Update company omission SQL --- company_linkage/sql/paper_visualization_data.sql | 10 ++++------ company_linkage/sql/patent_visualization_data.sql | 10 ++++------ .../sql/visualization_data_omit_by_rule.sql | 10 ++++------ company_linkage/sql/workforce_visualization_data.sql | 10 ++++------ 4 files changed, 16 insertions(+), 24 deletions(-) diff --git a/company_linkage/sql/paper_visualization_data.sql b/company_linkage/sql/paper_visualization_data.sql index 00e03c81..122b58a8 100644 --- a/company_linkage/sql/paper_visualization_data.sql +++ b/company_linkage/sql/paper_visualization_data.sql @@ -1,20 +1,18 @@ -CREATE OR REPLACE TABLE - `gcp-cset-projects.ai_companies_visualization.paper_visualization_data` AS -- Selecting the companies we want to leave out WITH to_omit AS ( SELECT CSET_id FROM - ai_companies_visualization.visualization_data + staging_ai_companies_visualization.visualization_data_omit_by_year RIGHT JOIN - ai_companies_visualization.paper_visualization_data + staging_ai_companies_visualization.paper_visualization_data_with_methods USING (cset_id) - WHERE visualization_data.cset_id IS NULL) + WHERE visualization_data_omit_by_year.cset_id IS NULL) SELECT * FROM - `gcp-cset-projects.ai_companies_visualization.paper_visualization_data` + staging_ai_companies_visualization.paper_visualization_data_with_methods WHERE CSET_id NOT IN 
( SELECT diff --git a/company_linkage/sql/patent_visualization_data.sql b/company_linkage/sql/patent_visualization_data.sql index 8781112e..29818a23 100644 --- a/company_linkage/sql/patent_visualization_data.sql +++ b/company_linkage/sql/patent_visualization_data.sql @@ -1,20 +1,18 @@ -CREATE OR REPLACE TABLE - `gcp-cset-projects.ai_companies_visualization.patent_visualization_data` AS -- Selecting the companies we want to leave out WITH to_omit AS ( SELECT CSET_id FROM - ai_companies_visualization.visualization_data + staging_ai_companies_visualization.visualization_data_omit_by_rule RIGHT JOIN - ai_companies_visualization.patent_visualization_data + staging_ai_companies_visualization.patent_visualization_data_with_by_year USING (cset_id) - WHERE visualization_data.cset_id IS NULL) + WHERE visualization_data_omit_by_rule.cset_id IS NULL) SELECT * FROM - `gcp-cset-projects.ai_companies_visualization.patent_visualization_data` + staging_ai_companies_visualization.patent_visualization_data_with_by_year WHERE CSET_id NOT IN ( SELECT diff --git a/company_linkage/sql/visualization_data_omit_by_rule.sql b/company_linkage/sql/visualization_data_omit_by_rule.sql index 597e871c..bef7cb15 100644 --- a/company_linkage/sql/visualization_data_omit_by_rule.sql +++ b/company_linkage/sql/visualization_data_omit_by_rule.sql @@ -1,17 +1,15 @@ -CREATE OR REPLACE TABLE - `gcp-cset-projects.ai_companies_visualization.visualization_data` AS -- Selecting the companies we want to leave out WITH to_omit AS ( SELECT CSET_id FROM - ai_companies_visualization.visualization_data + staging_ai_companies_visualization.visualization_data_with_all_papers LEFT JOIN - ai_companies_visualization.patent_visualization_data + staging_ai_companies_visualization.patent_visualization_data_with_by_year USING (cset_id) LEFT JOIN - ai_companies_visualization.workforce_visualization_data + staging_ai_companies_visualization.workforce_visualization_data_with_ai_jobs USING (cset_id) WHERE @@ -32,7 +30,7 @@ WITH 
SELECT * FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data` + staging_ai_companies_visualization.visualization_data_with_all_papers WHERE CSET_id NOT IN ( SELECT diff --git a/company_linkage/sql/workforce_visualization_data.sql b/company_linkage/sql/workforce_visualization_data.sql index 40a25b3c..3779e5e3 100644 --- a/company_linkage/sql/workforce_visualization_data.sql +++ b/company_linkage/sql/workforce_visualization_data.sql @@ -1,20 +1,18 @@ -CREATE OR REPLACE TABLE - `gcp-cset-projects.ai_companies_visualization.workforce_visualization_data` AS -- Selecting the companies we want to leave out WITH to_omit AS ( SELECT CSET_id FROM - ai_companies_visualization.visualization_data + staging_ai_companies_visualization.visualization_data_omit_by_rule RIGHT JOIN - ai_companies_visualization.workforce_visualization_data + staging_ai_companies_visualization.workforce_visualization_data_with_ai_jobs USING (cset_id) - WHERE visualization_data.cset_id IS NULL) + WHERE visualization_data_omit_by_rule.cset_id IS NULL) SELECT * FROM - `gcp-cset-projects.ai_companies_visualization.workforce_visualization_data` + staging_ai_companies_visualization.workforce_visualization_data_with_ai_jobs WHERE CSET_id NOT IN ( SELECT From cc60fb2aa45c0ae34414d8a612f86a0121e3b021 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Fri, 13 Oct 2023 12:45:59 -0400 Subject: [PATCH 10/17] Update crunchbase linkage --- company_linkage/sql/visualization_data.sql | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/company_linkage/sql/visualization_data.sql b/company_linkage/sql/visualization_data.sql index 1f48d965..ad7022b7 100644 --- a/company_linkage/sql/visualization_data.sql +++ b/company_linkage/sql/visualization_data.sql @@ -1,14 +1,12 @@ -- We're adding useful Crunchbase data to the visualization: descriptions, logos, and the company's "stage" -- (which we're using as a proxy for its size/growth but is actually based on what funding it has 
received). -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS WITH -- Pull in all the visualization data, most importantly including the crunchbase uuid that will be used to connect to everything else visualization AS ( SELECT * FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data`), + staging_ai_companies_visualization.visualization_data_omit_by_rule), -- Grab the descriptions and logos from Crunchbase ODM odm_data AS ( SELECT @@ -16,7 +14,7 @@ WITH short_description, logo_url FROM - `gcp-cset-projects.gcp_cset_crunchbase.organizations_odm`), + gcp_cset_crunchbase.organizations_odm), -- Grab the raw stage data for companies -- Since companies have multiple funding rounds they may have multiple rows! -- We need to deal with this @@ -51,7 +49,7 @@ WITH END AS stage FROM - `gcp-cset-projects.gcp_cset_crunchbase.funding_rounds`), + gcp_cset_crunchbase.funding_rounds), -- Now we want only one stage value to come out for any given company -- If a company has ever been mature, it's no longer growth or startup, etc. 
-- So there's a clear hierarchy, and we take the max @@ -78,9 +76,9 @@ WITH FROM combine_stages LEFT JOIN - gcp_cset_crunchbase.organizations orgs + gcp_cset_crunchbase.organizations ON - combine_stages.org_uuid = orgs.uuid ), + combine_stages.org_uuid = organizations.uuid ), stage_name AS ( SELECT org_uuid, @@ -111,7 +109,7 @@ FROM LEFT JOIN odm_data ON - visualization.crunchbase.crunchbase_uuid = odm_data.uuid + TRIM(visualization.crunchbase.crunchbase_uuid) = TRIM(odm_data.uuid) LEFT JOIN stage_name ON From 28a0937a547de98e626f121ecb125feb34704223 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Fri, 13 Oct 2023 16:37:24 -0400 Subject: [PATCH 11/17] Reorder readme to reflect new ordering --- company_linkage/README.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/company_linkage/README.md b/company_linkage/README.md index e3248a3b..6a585284 100644 --- a/company_linkage/README.md +++ b/company_linkage/README.md @@ -33,23 +33,23 @@ run some of this code as-is. 15. Upload `all_paper_counts.jsonl` to `ai_companies_visualization.total_paper_counts` using the [all_papers_schema](schemas/all_papers_schema.json) 16. [initial_visualization_data.sql](sql/initial_visualization_data.sql) 17. [visualization_data_with_by_year.sql](sql/visualization_data_with_by_year.sql) -18. [initial_patent_visualization_data.sql](sql/initial_patent_visualization_data.sql) -19. [patent_visualization_data_with_by_year.sql](sql/patent_visualization_data_with_by_year.sql) -20. [initial_paper_visualization_data.sql](sql/initial_paper_visualization_data.sql) -21. [paper_visualization_data_with_mag.sql](sql/paper_visualization_data_with_mag.sql) -22. [paper_visualization_data_with_clusters.sql](sql/paper_visualization_data_with_clusters.sql) -23. [paper_visualization_data_with_company_references.sql](sql/paper_visualization_data_with_company_references.sql) -24. [paper_visualization_data_with_tasks.sql](sql/paper_visualization_data_with_tasks.sql) -25. 
[paper_visualization_data_with_methods.sql](sql/paper_visualization_data_with_methods.sql) -26. [visualization_data_with_top_papers.sql](sql/visualization_data_with_top_papers.sql) -27. [visualization_data_with_all_papers.sql](sql/visualization_data_with_all_papers.sql) +18. [visualization_data_with_top_papers.sql](sql/visualization_data_with_top_papers.sql) +19. [visualization_data_with_all_papers.sql](sql/visualization_data_with_all_papers.sql) +20. [initial_patent_visualization_data.sql](sql/initial_patent_visualization_data.sql) +21. [patent_visualization_data_with_by_year.sql](sql/patent_visualization_data_with_by_year.sql) +22. [initial_paper_visualization_data.sql](sql/initial_paper_visualization_data.sql) +23. [paper_visualization_data_with_mag.sql](sql/paper_visualization_data_with_mag.sql) +24. [paper_visualization_data_with_clusters.sql](sql/paper_visualization_data_with_clusters.sql) +25. [paper_visualization_data_with_company_references.sql](sql/paper_visualization_data_with_company_references.sql) +26. [paper_visualization_data_with_tasks.sql](sql/paper_visualization_data_with_tasks.sql) +27. [paper_visualization_data_with_methods.sql](sql/paper_visualization_data_with_methods.sql) 28. [initial_workforce_visualization_data.sql](sql/initial_workforce_visualization_data.sql) 29. [workforce_visualization_data_with_ai_jobs.sql](sql/workforce_visualization_data_with_ai_jobs.sql) 30. [visualization_data_omit_by_rule.sql](sql/visualization_data_omit_by_rule.sql) -31. [paper_visualization_data.sql](sql/paper_visualization_data.sql) +31. [visualization_data.sql](sql/visualization_data.sql) 32. [patent_visualization_data.sql](sql/patent_visualization_data.sql) -33. [workforce_visualization_data.sql](sql/workforce_visualization_data.sql) -34. [visualization_data.sql](sql/visualization_data.sql) +33. [paper_visualization_data.sql](sql/paper_visualization_data.sql) +34. 
[workforce_visualization_data.sql](sql/workforce_visualization_data.sql) # Deployment From 02f7ee1d45cc30982c085478028e2f520d895426 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Mon, 16 Oct 2023 10:27:33 -0400 Subject: [PATCH 12/17] Add some initial check queries --- .../sql/check_all_paper_counts_greater.sql | 5 +++ .../sql/check_paper_counts_exist.sql | 11 +++++ .../sql/check_paper_have_all_ids_pre_omit.sql | 11 +++++ .../sql/check_patent_counts_exist.sql | 40 +++++++++++++++++++ .../check_patent_have_all_ids_pre_omit.sql | 11 +++++ ...ck_visualization_have_all_ids_pre_omit.sql | 11 +++++ .../check_workforce_have_all_ids_pre_omit.sql | 11 +++++ 7 files changed, 100 insertions(+) create mode 100644 company_linkage/sql/check_all_paper_counts_greater.sql create mode 100644 company_linkage/sql/check_paper_counts_exist.sql create mode 100644 company_linkage/sql/check_paper_have_all_ids_pre_omit.sql create mode 100644 company_linkage/sql/check_patent_counts_exist.sql create mode 100644 company_linkage/sql/check_patent_have_all_ids_pre_omit.sql create mode 100644 company_linkage/sql/check_visualization_have_all_ids_pre_omit.sql create mode 100644 company_linkage/sql/check_workforce_have_all_ids_pre_omit.sql diff --git a/company_linkage/sql/check_all_paper_counts_greater.sql b/company_linkage/sql/check_all_paper_counts_greater.sql new file mode 100644 index 00000000..f3e20b8b --- /dev/null +++ b/company_linkage/sql/check_all_paper_counts_greater.sql @@ -0,0 +1,5 @@ +SELECT + LOGICAL_AND(all_pubs >= ai_pubs) + AND LOGICAL_AND(all_pubs >= ai_pubs_in_top_conferences) +FROM + staging_ai_companies_visualization.visualization_data_with_all_papers \ No newline at end of file diff --git a/company_linkage/sql/check_paper_counts_exist.sql b/company_linkage/sql/check_paper_counts_exist.sql new file mode 100644 index 00000000..4171ab87 --- /dev/null +++ b/company_linkage/sql/check_paper_counts_exist.sql @@ -0,0 +1,11 @@ +SELECT + COUNT(*) = 0 +FROM + 
staging_ai_companies_visualization.visualization_data_with_all_papers +WHERE + ai_pubs IS NULL + OR robotics_pubs IS NULL + OR cv_pubs IS NULL + OR nlp_pubs IS NULL + OR ai_pubs_in_top_conferences IS NULL + OR all_pubs IS NULL \ No newline at end of file diff --git a/company_linkage/sql/check_paper_have_all_ids_pre_omit.sql b/company_linkage/sql/check_paper_have_all_ids_pre_omit.sql new file mode 100644 index 00000000..3b33804a --- /dev/null +++ b/company_linkage/sql/check_paper_have_all_ids_pre_omit.sql @@ -0,0 +1,11 @@ +-- Check that the paper visualization data table has all the CSET organization ids in the table before +-- we run omit by rule +SELECT + COUNT(DISTINCT paper_visualization_data_with_methods.CSET_id) = COUNT(DISTINCT aggregated_organizations.CSET_id) + AND LOGICAL_AND(paper_visualization_data_with_methods.CSET_id IS NOT NULL) +FROM + staging_ai_companies_visualization.paper_visualization_data_with_methods +FULL OUTER JOIN + high_resolution_entities.aggregated_organizations +USING + (CSET_id) \ No newline at end of file diff --git a/company_linkage/sql/check_patent_counts_exist.sql b/company_linkage/sql/check_patent_counts_exist.sql new file mode 100644 index 00000000..e90821f6 --- /dev/null +++ b/company_linkage/sql/check_patent_counts_exist.sql @@ -0,0 +1,40 @@ +SELECT + COUNT(*) = 0 +FROM + staging_ai_companies_visualization.patent_visualization_data_with_by_year +WHERE + ai_patents IS NULL + OR Physical_Sciences_and_Engineering_pats IS NULL + OR Life_Sciences_pats IS NULL + OR Security__eg_cybersecurity_pats IS NULL + OR Transportation_pats IS NULL + OR Education_pats IS NULL + OR Document_Mgt_and_Publishing_pats IS NULL + OR Military_pats IS NULL + OR Agricultural_pats IS NULL + OR Computing_in_Government_pats IS NULL + OR Personal_Devices_and_Computing_pats IS NULL + OR Banking_and_Finance_pats IS NULL + OR Telecommunications_pats IS NULL + OR Networks__eg_social_IOT_etc_pats IS NULL + OR Business_pats IS NULL + OR Energy_Management_pats IS 
NULL + OR Entertainment_pats IS NULL + OR Nanotechnology_pats IS NULL + OR Semiconductors_pats IS NULL + OR Language_Processing_pats IS NULL + OR Speech_Processing_pats IS NULL + OR Knowledge_Representation_pats IS NULL + OR Planning_and_Scheduling_pats IS NULL + OR Control_pats IS NULL + OR Distributed_AI_pats IS NULL + OR Robotics_pats IS NULL + OR Computer_Vision_pats IS NULL + OR Analytics_and_Algorithms_pats IS NULL + OR Measuring_and_Testing_pats IS NULL + OR Logic_Programming_pats IS NULL + OR Fuzzy_Logic_pats IS NULL + OR Probabilistic_Reasoning_pats IS NULL + OR Ontology_Engineering_pats IS NULL + OR Machine_Learning_pats IS NULL + OR Search_Methods_pats IS NULL \ No newline at end of file diff --git a/company_linkage/sql/check_patent_have_all_ids_pre_omit.sql b/company_linkage/sql/check_patent_have_all_ids_pre_omit.sql new file mode 100644 index 00000000..3aa42d29 --- /dev/null +++ b/company_linkage/sql/check_patent_have_all_ids_pre_omit.sql @@ -0,0 +1,11 @@ +-- Check that the patent visualization data table has all the CSET organization ids in the table before +-- we run omit by rule +SELECT + COUNT(DISTINCT patent_visualization_data_with_by_year.CSET_id) = COUNT(DISTINCT aggregated_organizations.CSET_id) + AND LOGICAL_AND(patent_visualization_data_with_by_year.CSET_id IS NOT NULL) +FROM + staging_ai_companies_visualization.patent_visualization_data_with_by_year +FULL OUTER JOIN + high_resolution_entities.aggregated_organizations +USING + (CSET_id) \ No newline at end of file diff --git a/company_linkage/sql/check_visualization_have_all_ids_pre_omit.sql b/company_linkage/sql/check_visualization_have_all_ids_pre_omit.sql new file mode 100644 index 00000000..cd5532fb --- /dev/null +++ b/company_linkage/sql/check_visualization_have_all_ids_pre_omit.sql @@ -0,0 +1,11 @@ +-- Check that the visualization data table has all the CSET organization ids in the table before +-- we run omit by rule +SELECT + COUNT(DISTINCT visualization_data_with_all_papers.CSET_id) 
= COUNT(DISTINCT aggregated_organizations.CSET_id) + AND LOGICAL_AND(visualization_data_with_all_papers.CSET_id IS NOT NULL) +FROM + staging_ai_companies_visualization.visualization_data_with_all_papers +FULL OUTER JOIN + high_resolution_entities.aggregated_organizations +USING + (CSET_id) \ No newline at end of file diff --git a/company_linkage/sql/check_workforce_have_all_ids_pre_omit.sql b/company_linkage/sql/check_workforce_have_all_ids_pre_omit.sql new file mode 100644 index 00000000..fb9aaeb2 --- /dev/null +++ b/company_linkage/sql/check_workforce_have_all_ids_pre_omit.sql @@ -0,0 +1,11 @@ +-- Check that the workforce visualization data table has all the CSET organization ids in the table before +-- we run omit by rule +SELECT + COUNT(DISTINCT workforce_visualization_data_with_ai_jobs.CSET_id) = COUNT(DISTINCT aggregated_organizations.CSET_id) + AND LOGICAL_AND(workforce_visualization_data_with_ai_jobs.CSET_id IS NOT NULL) +FROM + staging_ai_companies_visualization.workforce_visualization_data_with_ai_jobs +FULL OUTER JOIN + high_resolution_entities.aggregated_organizations +USING + (CSET_id) \ No newline at end of file From 8a00cd0f4573f10712384b2b9432e088c53262c8 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Tue, 17 Oct 2023 11:53:02 -0400 Subject: [PATCH 13/17] Switch name to merged_id in predictions table --- company_linkage/sql/ai_publications.sql | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/company_linkage/sql/ai_publications.sql b/company_linkage/sql/ai_publications.sql index 2aacb37f..0a000a1d 100644 --- a/company_linkage/sql/ai_publications.sql +++ b/company_linkage/sql/ai_publications.sql @@ -1,10 +1,7 @@ - -- Pulling every AI-associated publication id linked to every grid id and every organization name - -- We also include years because we'll want those later for yearly counts - -- and cv/robotics/nlp so we can filter on these WITH ai_papers AS ( SELECT - cset_id AS merged_id, + merged_id, cv_filtered, nlp_filtered, 
robotics_filtered From 5fa80da27c2bb9aec15534cd68e831f9b715a51c Mon Sep 17 00:00:00 2001 From: Rebecca Date: Fri, 1 Dec 2023 15:17:22 -0500 Subject: [PATCH 14/17] Update for tasks and methods --- .../sql/paper_visualization_data_with_methods.sql | 8 +++----- .../sql/paper_visualization_data_with_tasks.sql | 8 +++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/company_linkage/sql/paper_visualization_data_with_methods.sql b/company_linkage/sql/paper_visualization_data_with_methods.sql index 853177cd..991a122e 100644 --- a/company_linkage/sql/paper_visualization_data_with_methods.sql +++ b/company_linkage/sql/paper_visualization_data_with_methods.sql @@ -1,12 +1,10 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.paper_visualization_data AS WITH articles_with_ai_methods AS ( SELECT DISTINCT merged_id, referent, FROM - `gcp-cset-projects.tasks_and_methods.method_referents` + tasks_and_methods.method_referents CROSS JOIN UNNEST(referents) AS referent), company_articles_with_methods AS ( @@ -44,10 +42,10 @@ WITH GROUP BY CSET_id) SELECT - paper_visualization_data.*, + paper_visualization_data_with_tasks.*, methods FROM - ai_companies_visualization.paper_visualization_data + ai_companies_visualization.paper_visualization_data_with_tasks LEFT JOIN aggregated_fields USING diff --git a/company_linkage/sql/paper_visualization_data_with_tasks.sql b/company_linkage/sql/paper_visualization_data_with_tasks.sql index afaf256c..b2c6b157 100644 --- a/company_linkage/sql/paper_visualization_data_with_tasks.sql +++ b/company_linkage/sql/paper_visualization_data_with_tasks.sql @@ -1,12 +1,10 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.paper_visualization_data AS WITH articles_with_ai_tasks AS ( SELECT DISTINCT merged_id, referent, FROM - `gcp-cset-projects.tasks_and_methods.task_referents` + tasks_and_methods.task_referents CROSS JOIN UNNEST(referents) AS referent), company_articles_with_tasks AS ( @@ -44,10 +42,10 @@ WITH GROUP BY CSET_id) 
SELECT - paper_visualization_data.*, + paper_visualization_data_with_company_references.*, tasks FROM - ai_companies_visualization.paper_visualization_data + ai_companies_visualization.paper_visualization_data_with_company_references LEFT JOIN aggregated_fields USING From a9d54cb0f8acdce54c745da200db1b407bc8d01a Mon Sep 17 00:00:00 2001 From: Rebecca Date: Mon, 4 Dec 2023 17:57:33 -0500 Subject: [PATCH 15/17] Finalize pipeline; fix idempotency; add schemas --- company_linkage/parat_data_dag.py | 63 +- .../schemas/paper_visualization_data.json | 128 +++ .../schemas/patent_visualization_data.json | 950 ++++++++++++++++++ .../schemas/visualization_data.json | 376 +++++++ .../schemas/workforce_visualization_data.json | 20 + .../sql/initial_paper_visualization_data.sql | 16 +- .../initial_workforce_visualization_data.sql | 17 +- .../sql/paper_visualization_data.sql | 4 +- .../paper_visualization_data_with_methods.sql | 4 +- .../paper_visualization_data_with_tasks.sql | 4 +- ...kforce_visualization_data_with_ai_jobs.sql | 23 +- 11 files changed, 1581 insertions(+), 24 deletions(-) create mode 100644 company_linkage/schemas/paper_visualization_data.json create mode 100644 company_linkage/schemas/patent_visualization_data.json create mode 100644 company_linkage/schemas/visualization_data.json create mode 100644 company_linkage/schemas/workforce_visualization_data.json diff --git a/company_linkage/parat_data_dag.py b/company_linkage/parat_data_dag.py index 0bde5f58..c3d3d3e5 100644 --- a/company_linkage/parat_data_dag.py +++ b/company_linkage/parat_data_dag.py @@ -4,10 +4,11 @@ from airflow import DAG from airflow.operators.python import PythonOperator from airflow.operators.trigger_dagrun import TriggerDagRunOperator -from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator +from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator, BigQueryCheckOperator from airflow.providers.google.cloud.operators.cloud_sql 
import ( CloudSQLImportInstanceOperator, ) +from airflow.providers.google.cloud.transfers.bigquery_to_bigquery import BigQueryToBigQueryOperator from airflow.providers.google.cloud.operators.kubernetes_engine import GKEStartPodOperator from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator from airflow.operators.dummy import DummyOperator @@ -19,10 +20,12 @@ DATA_BUCKET, PROJECT_ID, GCP_ZONE, + DAGS_DIR, get_default_args, get_post_success, ) from dataloader.scripts.populate_documentation import update_table_descriptions + from parat_scripts.aggregate_organizations import aggregate_organizations bucket = DATA_BUCKET @@ -30,6 +33,7 @@ intermediate_dataset = "high_resolution_entities" production_dataset = "ai_companies_visualization" staging_dataset = f"staging_{production_dataset}" +backups_dataset = f"{production_dataset}_backups" sql_dir = "sql/parat" schema_dir = "parat/schemas" tmp_dir = f"{production_dataset}/tmp" @@ -49,7 +53,8 @@ "staging_dataset": staging_dataset, "production_dataset": production_dataset, "intermediate_dataset": intermediate_dataset, - "initial_dataset": initial_dataset + "initial_dataset": initial_dataset, + "backups_dataset": backups_dataset, }, ) with dag: @@ -293,6 +298,51 @@ curr = next_tab curr >> wait_for_visualization_tables + checks = [] + for query in os.listdir(f"{DAGS_DIR}/{sql_dir}"): + if not query.startswith("check_"): + continue + checks.append(BigQueryCheckOperator( + task_id=query.replace(".sql", ""), + sql=f"{sql_dir}/{query}", + use_legacy_sql=False + )) + + wait_for_checks = DummyOperator(task_id="wait_for_checks") + + wait_for_copy = DummyOperator(task_id="wait_for_copy") + + curr_date = datetime.now().strftime('%Y%m%d') + prod_tables = ["visualization_data", "paper_visualization_data", + "patent_visualization_data", "workforce_visualization_data"] + for table in prod_tables: + prod_table_name = f"{production_dataset}.{table}" + copy_to_production = BigQueryToBigQueryOperator( + 
task_id="copy_" + table + "_to_production", + source_project_dataset_tables=[staging_dataset + "." + table], + destination_project_dataset_table=prod_table_name, + create_disposition="CREATE_IF_NEEDED", + write_disposition="WRITE_TRUNCATE" + ) + pop_descriptions = PythonOperator( + task_id="populate_column_documentation_for_" + table, + op_kwargs={ + "input_schema": f"{os.environ.get('DAGS_FOLDER')}/schemas/parat/{table}.json", + "table_name": prod_table_name + }, + python_callable=update_table_descriptions + ) + table_backup = BigQueryToBigQueryOperator( + task_id=f"back_up_{table}", + source_project_dataset_tables=[f"{staging_dataset}.{table}"], + destination_project_dataset_table=f"{backups_dataset}.{table}_{curr_date}", + create_disposition="CREATE_IF_NEEDED", + write_disposition="WRITE_TRUNCATE" + ) + wait_for_checks >> copy_to_production >> pop_descriptions >> table_backup >> wait_for_copy + + # post success to slack + msg_success = get_post_success("PARAT tables updated!", dag) ( clear_tmp_dir @@ -312,4 +362,13 @@ >> load_all_papers >> start_visualization_tables ) + ( + wait_for_visualization_tables + >> checks + >> wait_for_checks + ) + ( + wait_for_copy + >> msg_success + ) diff --git a/company_linkage/schemas/paper_visualization_data.json b/company_linkage/schemas/paper_visualization_data.json new file mode 100644 index 00000000..cc9e5aab --- /dev/null +++ b/company_linkage/schemas/paper_visualization_data.json @@ -0,0 +1,128 @@ +[ + { + "mode": "NULLABLE", + "name": "CSET_id", + "type": "INTEGER", + "description": "CSET id of PARAT company." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year cited." + }, + { + "mode": "NULLABLE", + "name": "citation_count", + "type": "INTEGER", + "description": "Count of publications in that year that cite AI papers written by the company." 
+ } + ], + "mode": "REPEATED", + "name": "citation_count_by_year", + "type": "RECORD", + "description": "Citations of AI papers by the company by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "field_name", + "type": "STRING", + "description": "Field of study name." + }, + { + "mode": "NULLABLE", + "name": "field_count", + "type": "INTEGER", + "description": "Count of AI papers by the company where field of study is in their top fields." + } + ], + "mode": "REPEATED", + "name": "fields", + "type": "RECORD", + "description": "Fields of study counts (using MAG-style fields of study for AI-relevant fields)." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "cluster_id", + "type": "INTEGER", + "description": "Map of Science research cluster ID." + }, + { + "mode": "NULLABLE", + "name": "cluster_count", + "type": "INTEGER", + "description": "Count of how many AI publications from the company appear in that cluster." + } + ], + "mode": "REPEATED", + "name": "clusters", + "type": "RECORD", + "description": "Counts of top publications in research clusters." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "ref_CSET_id", + "type": "INTEGER", + "description": "CSET id of referenced PARAT company." + }, + { + "mode": "NULLABLE", + "name": "referenced_count", + "type": "INTEGER", + "description": "Count of how many AI publications by that company the primary PARAT company has referenced in their papers." + } + ], + "mode": "REPEATED", + "name": "company_references", + "type": "RECORD", + "description": "Counts of publication references to the publications of other companies in the PARAT dataset." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "referent", + "type": "STRING", + "description": "The task name referent." + }, + { + "mode": "NULLABLE", + "name": "task_count", + "type": "INTEGER", + "description": "Count of how many AI publications by the company contain this task." 
+ } + ], + "mode": "REPEATED", + "name": "tasks", + "type": "RECORD", + "description": "AI task information." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "referent", + "type": "STRING", + "description": "The method name referent." + }, + { + "mode": "NULLABLE", + "name": "method_count", + "type": "INTEGER", + "description": "Count of how many AI publications by the company contain this method." + } + ], + "mode": "REPEATED", + "name": "methods", + "type": "RECORD", + "description": "AI method information." + } +] \ No newline at end of file diff --git a/company_linkage/schemas/patent_visualization_data.json b/company_linkage/schemas/patent_visualization_data.json new file mode 100644 index 00000000..42496163 --- /dev/null +++ b/company_linkage/schemas/patent_visualization_data.json @@ -0,0 +1,950 @@ +[ + { + "mode": "NULLABLE", + "name": "CSET_id", + "type": "INTEGER", + "description": "CSET id of PARAT company." + }, + { + "mode": "NULLABLE", + "name": "name", + "type": "STRING", + "description": "Name of PARAT company." + }, + { + "mode": "NULLABLE", + "name": "ai_patents", + "type": "INTEGER", + "description": "Total AI patent families." + }, + { + "mode": "NULLABLE", + "name": "Physical_Sciences_and_Engineering_pats", + "type": "INTEGER", + "description": "AI patent families in physical science and engineering application category." + }, + { + "mode": "NULLABLE", + "name": "Life_Sciences_pats", + "type": "INTEGER", + "description": "AI patent families in life sciences application category." + }, + { + "mode": "NULLABLE", + "name": "Security__eg_cybersecurity_pats", + "type": "INTEGER", + "description": "AI patent families in security (e.g. cybersecurity) application category." + }, + { + "mode": "NULLABLE", + "name": "Transportation_pats", + "type": "INTEGER", + "description": "AI patent families in transportation application category." 
+ }, + { + "mode": "NULLABLE", + "name": "Industrial_and_Manufacturing_pats", + "type": "INTEGER", + "description": "AI patent families in industrial and manufacturing application category." + }, + { + "mode": "NULLABLE", + "name": "Education_pats", + "type": "INTEGER", + "description": "AI patent families in education application category." + }, + { + "mode": "NULLABLE", + "name": "Document_Mgt_and_Publishing_pats", + "type": "INTEGER", + "description": "AI patent families in document management and publishing application category." + }, + { + "mode": "NULLABLE", + "name": "Military_pats", + "type": "INTEGER", + "description": "AI patent families in military application category." + }, + { + "mode": "NULLABLE", + "name": "Agricultural_pats", + "type": "INTEGER", + "description": "AI patent families in agricultural application category." + }, + { + "mode": "NULLABLE", + "name": "Computing_in_Government_pats", + "type": "INTEGER", + "description": "AI patent families in computing in government application category." + }, + { + "mode": "NULLABLE", + "name": "Personal_Devices_and_Computing_pats", + "type": "INTEGER", + "description": "AI patent families in personal devices and computing application category." + }, + { + "mode": "NULLABLE", + "name": "Banking_and_Finance_pats", + "type": "INTEGER", + "description": "AI patent families in banking and finance application category." + }, + { + "mode": "NULLABLE", + "name": "Telecommunications_pats", + "type": "INTEGER", + "description": "AI patent families in telecommunications application category." + }, + { + "mode": "NULLABLE", + "name": "Networks__eg_social_IOT_etc_pats", + "type": "INTEGER", + "description": "AI patent families in networks (e.g. social, IOT, etc.) application category." + }, + { + "mode": "NULLABLE", + "name": "Business_pats", + "type": "INTEGER", + "description": "AI patent families in business application category." 
+ }, + { + "mode": "NULLABLE", + "name": "Energy_Management_pats", + "type": "INTEGER", + "description": "AI patent families in energy management application category." + }, + { + "mode": "NULLABLE", + "name": "Entertainment_pats", + "type": "INTEGER", + "description": "AI patent families in entertainment application category." + }, + { + "mode": "NULLABLE", + "name": "Nanotechnology_pats", + "type": "INTEGER", + "description": "AI patent families in nanotechnology application category." + }, + { + "mode": "NULLABLE", + "name": "Semiconductors_pats", + "type": "INTEGER", + "description": "AI patent families in semiconductors application category." + }, + { + "mode": "NULLABLE", + "name": "Language_Processing_pats", + "type": "INTEGER", + "description": "AI patent families in language processing functional application category." + }, + { + "mode": "NULLABLE", + "name": "Speech_Processing_pats", + "type": "INTEGER", + "description": "AI patent families in speech processing functional application category." + }, + { + "mode": "NULLABLE", + "name": "Knowledge_Representation_pats", + "type": "INTEGER", + "description": "AI patent families in knowledge representation functional application category." + }, + { + "mode": "NULLABLE", + "name": "Planning_and_Scheduling_pats", + "type": "INTEGER", + "description": "AI patent families in planning and scheduling functional application category." + }, + { + "mode": "NULLABLE", + "name": "Control_pats", + "type": "INTEGER", + "description": "AI patent families in control functional application category." + }, + { + "mode": "NULLABLE", + "name": "Distributed_AI_pats", + "type": "INTEGER", + "description": "AI patent families in distributed AI functional application category." + }, + { + "mode": "NULLABLE", + "name": "Robotics_pats", + "type": "INTEGER", + "description": "AI patent families in robotics functional application category." 
+ }, + { + "mode": "NULLABLE", + "name": "Computer_Vision_pats", + "type": "INTEGER", + "description": "AI patent families in computer vision functional application category." + }, + { + "mode": "NULLABLE", + "name": "Analytics_and_Algorithms_pats", + "type": "INTEGER", + "description": "AI patent families in analytics and algorithms functional application category." + }, + { + "mode": "NULLABLE", + "name": "Measuring_and_Testing_pats", + "type": "INTEGER", + "description": "AI patent families in measuring and testing functional application category." + }, + { + "mode": "NULLABLE", + "name": "Logic_Programming_pats", + "type": "INTEGER", + "description": "AI patent families in logic programming AI techniques category." + }, + { + "mode": "NULLABLE", + "name": "Fuzzy_Logic_pats", + "type": "INTEGER", + "description": "AI patent families in fuzzy logic AI techniques category." + }, + { + "mode": "NULLABLE", + "name": "Probabilistic_Reasoning_pats", + "type": "INTEGER", + "description": "AI patent families in probabilistic reasoning AI techniques category." + }, + { + "mode": "NULLABLE", + "name": "Ontology_Engineering_pats", + "type": "INTEGER", + "description": "AI patent families in ontology engineering AI techniques category." + }, + { + "mode": "NULLABLE", + "name": "Machine_Learning_pats", + "type": "INTEGER", + "description": "AI patent families in machine learning AI techniques category." + }, + { + "mode": "NULLABLE", + "name": "Search_Methods_pats", + "type": "INTEGER", + "description": "AI patent families in search methods AI techniques category." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "ai_patents", + "type": "INTEGER", + "description": "AI patent families count for that year." 
+ } + ], + "mode": "REPEATED", + "name": "ai_patents_by_year", + "type": "RECORD", + "description": "Count of total AI patent families by priority year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Physical_Sciences_and_Engineering_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the physical science and engineering application category for that year." + } + ], + "mode": "REPEATED", + "name": "Physical_Sciences_and_Engineering_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the physical science and engineering application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Life_Sciences_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the life sciences application category for that year." + } + ], + "mode": "REPEATED", + "name": "Life_Sciences_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the life sciences application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Security__eg_cybersecurity_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the security (e.g. cybersecurity) application category for that year." + } + ], + "mode": "REPEATED", + "name": "Security__eg_cybersecurity_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the security (e.g. cybersecurity) application category by year." 
+ }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Transportation_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the transportation application category for that year." + } + ], + "mode": "REPEATED", + "name": "Transportation_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the transportation application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Industrial_and_Manufacturing_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the industrial and manufacturing application category for that year." + } + ], + "mode": "REPEATED", + "name": "Industrial_and_Manufacturing_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the industrial and manufacturing application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Education_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the education application category for that year." + } + ], + "mode": "REPEATED", + "name": "Education_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the education application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Document_Mgt_and_Publishing_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the document management and publishing application category for that year." 
+ } + ], + "mode": "REPEATED", + "name": "Document_Mgt_and_Publishing_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the document management and publishing application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Military_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the military application category for that year." + } + ], + "mode": "REPEATED", + "name": "Military_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the military application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Agricultural_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the agricultural application category for that year." + } + ], + "mode": "REPEATED", + "name": "Agricultural_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the agricultural application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Computing_in_Government_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the computing in government application category for that year." + } + ], + "mode": "REPEATED", + "name": "Computing_in_Government_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the computing in government application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." 
+ }, + { + "mode": "NULLABLE", + "name": "Personal_Devices_and_Computing_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the personal devices and computing application category for that year." + } + ], + "mode": "REPEATED", + "name": "Personal_Devices_and_Computing_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the personal devices and computing application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Banking_and_Finance_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the banking and finance application category for that year." + } + ], + "mode": "REPEATED", + "name": "Banking_and_Finance_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the banking and finance application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Telecommunications_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the telecommunications application category for that year." + } + ], + "mode": "REPEATED", + "name": "Telecommunications_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the telecommunications application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Networks__eg_social_IOT_etc_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the networks (e.g. social, IOT, etc.) application category for that year." 
+ } + ], + "mode": "REPEATED", + "name": "Networks__eg_social_IOT_etc_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the networks (e.g. social, IOT, etc.) application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Business_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the business application category for that year." + } + ], + "mode": "REPEATED", + "name": "Business_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the business application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Energy_Management_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the energy management application category for that year." + } + ], + "mode": "REPEATED", + "name": "Energy_Management_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the energy management application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Entertainment_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the entertainment application category for that year." + } + ], + "mode": "REPEATED", + "name": "Entertainment_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the entertainment application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." 
+ }, + { + "mode": "NULLABLE", + "name": "Nanotechnology_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the nanotechnology application category for that year." + } + ], + "mode": "REPEATED", + "name": "Nanotechnology_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the nanotechnology application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Semiconductors_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the semiconductors application category for that year." + } + ], + "mode": "REPEATED", + "name": "Semiconductors_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the semiconductors application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Language_Processing_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the language processing functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Language_Processing_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the language processing functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Speech_Processing_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the speech processing functional application category for that year." 
+ } + ], + "mode": "REPEATED", + "name": "Speech_Processing_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the speech processing functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Knowledge_Representation_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the knowledge representation functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Knowledge_Representation_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the knowledge representation functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Planning_and_Scheduling_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the planning and scheduling functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Planning_and_Scheduling_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the planning and scheduling functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Control_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the control functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Control_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the control functional application category by year." 
+ }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Distributed_AI_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the distributed AI functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Distributed_AI_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the distributed AI functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Robotics_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the robotics functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Robotics_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the robotics functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Computer_Vision_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the computer vision functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Computer_Vision_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the computer vision functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Analytics_and_Algorithms_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the analytics and algorithms functional application category for that year." 
+ } + ], + "mode": "REPEATED", + "name": "Analytics_and_Algorithms_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the analytics and algorithms functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Measuring_and_Testing_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the measuring and testing functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Measuring_and_Testing_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the measuring and testing functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Logic_Programming_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the logic programming AI techniques category for that year." + } + ], + "mode": "REPEATED", + "name": "Logic_Programming_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the logic programming AI techniques category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Fuzzy_Logic_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the fuzzy logic AI techniques category for that year." + } + ], + "mode": "REPEATED", + "name": "Fuzzy_Logic_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the fuzzy logic AI techniques category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family."
+ }, + { + "mode": "NULLABLE", + "name": "Probabilistic_Reasoning_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the probabilistic reasoning AI techniques category for that year." + } + ], + "mode": "REPEATED", + "name": "Probabilistic_Reasoning_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the probabilistic reasoning AI techniques category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Ontology_Engineering_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the ontology engineering AI techniques category for that year." + } + ], + "mode": "REPEATED", + "name": "Ontology_Engineering_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the ontology engineering AI techniques category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Machine_Learning_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the machine learning AI techniques category for that year." + } + ], + "mode": "REPEATED", + "name": "Machine_Learning_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the machine learning AI techniques category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Search_Methods_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the search methods AI techniques category for that year." 
+ } + ], + "mode": "REPEATED", + "name": "Search_Methods_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the search methods AI techniques category by year." + } +] \ No newline at end of file diff --git a/company_linkage/schemas/visualization_data.json b/company_linkage/schemas/visualization_data.json new file mode 100644 index 00000000..b34fc089 --- /dev/null +++ b/company_linkage/schemas/visualization_data.json @@ -0,0 +1,376 @@ +[ + { + "mode": "NULLABLE", + "name": "CSET_id", + "type": "INTEGER", + "description": "CSET id of PARAT company." + }, + { + "mode": "NULLABLE", + "name": "name", + "type": "STRING", + "description": "Name of PARAT company." + }, + { + "mode": "NULLABLE", + "name": "country", + "type": "STRING", + "description": "Country of PARAT company. If company is located in multiple countries, country of headquarters." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "alias_language", + "type": "STRING", + "description": "Language alias is written in." + }, + { + "mode": "NULLABLE", + "name": "alias", + "type": "STRING", + "description": "Alias of company." + } + ], + "mode": "REPEATED", + "name": "aliases", + "type": "RECORD", + "description": "List of company aliases." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "parent_acquisition", + "type": "BOOLEAN", + "description": "Boolean indicating whether the company was acquired by its parent company." + }, + { + "mode": "NULLABLE", + "name": "parent_name", + "type": "STRING", + "description": "Name of parent company." + }, + { + "mode": "NULLABLE", + "name": "parent_id", + "type": "INTEGER", + "description": "CSET id of parent company." + } + ], + "mode": "REPEATED", + "name": "parent", + "type": "RECORD", + "description": "List of parent companies." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "child_name", + "type": "STRING", + "description": "Name of child company." 
+ }, + { + "mode": "NULLABLE", + "name": "child_id", + "type": "INTEGER", + "description": "CSET id of child company." + } + ], + "mode": "REPEATED", + "name": "children", + "type": "RECORD", + "description": "List of child companies." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "child_name", + "type": "STRING", + "description": "Name of child company." + }, + { + "mode": "NULLABLE", + "name": "child_id", + "type": "INTEGER", + "description": "CSET id of child companies." + } + ], + "mode": "REPEATED", + "name": "non_agg_children", + "type": "RECORD", + "description": "Name of child company whose data has not been aggregated into the records of the parent company." + }, + { + "mode": "REPEATED", + "name": "permid", + "type": "INTEGER", + "description": "Refinitiv Permid." + }, + { + "mode": "NULLABLE", + "name": "website", + "type": "STRING", + "description": "Company website." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "exchange", + "type": "STRING", + "description": "Exchange on which the company is listed." + }, + { + "mode": "NULLABLE", + "name": "ticker", + "type": "STRING", + "description": "Company ticker." + } + ], + "mode": "REPEATED", + "name": "market", + "type": "RECORD", + "description": "Company exchange and ticker data." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "crunchbase_uuid", + "type": "STRING", + "description": "UUID in Crunchbase." + }, + { + "mode": "NULLABLE", + "name": "crunchbase_url", + "type": "STRING", + "description": "URL on Crunchbase website." + } + ], + "mode": "NULLABLE", + "name": "crunchbase", + "type": "RECORD", + "description": "Crunchbase unique identifier." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "crunchbase_uuid", + "type": "STRING", + "description": "UUID in Crunchbase." + }, + { + "mode": "NULLABLE", + "name": "crunchbase_url", + "type": "STRING", + "description": "URL on Crunchbase website." 
+ } + ], + "mode": "REPEATED", + "name": "child_crunchbase", + "type": "RECORD", + "description": "Crunchbase unique identifiers for any child companies of the aggregated company." + }, + { + "mode": "REPEATED", + "name": "ror_id", + "type": "STRING", + "description": "ROR id for the company." + }, + { + "mode": "REPEATED", + "name": "linkedin", + "type": "STRING", + "description": "LinkedIn website for the company." + }, + { + "mode": "NULLABLE", + "name": "in_sandp_500", + "type": "BOOLEAN", + "description": "Indicator of whether the company was in the S&P 500 at some point during 2020." + }, + { + "mode": "NULLABLE", + "name": "in_fortune_global_500", + "type": "BOOLEAN", + "description": "Indicator of whether the company was on the 2021 Fortune Global 500 list." + }, + { + "mode": "NULLABLE", + "name": "ai_pubs", + "type": "INTEGER", + "description": "Count of total AI publications by the company." + }, + { + "mode": "NULLABLE", + "name": "cv_pubs", + "type": "INTEGER", + "description": "Count of total computer vision publications by the company." + }, + { + "mode": "NULLABLE", + "name": "nlp_pubs", + "type": "INTEGER", + "description": "Count of total natural language processing publications by the company." + }, + { + "mode": "NULLABLE", + "name": "robotics_pubs", + "type": "INTEGER", + "description": "Count of total robotics publications by the company." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year published." + }, + { + "mode": "NULLABLE", + "name": "ai_pubs", + "type": "INTEGER", + "description": "Count of total AI publications by the company in that year." + } + ], + "mode": "REPEATED", + "name": "ai_pubs_by_year", + "type": "RECORD", + "description": "Counts of AI publications by the company by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year published." 
+ }, + { + "mode": "NULLABLE", + "name": "cv_pubs", + "type": "INTEGER", + "description": "Count of total computer vision publications by the company in that year." + } + ], + "mode": "REPEATED", + "name": "cv_pubs_by_year", + "type": "RECORD", + "description": "Counts of computer vision publications by the company by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year published." + }, + { + "mode": "NULLABLE", + "name": "nlp_pubs", + "type": "INTEGER", + "description": "Count of total natural language processing publications by the company in that year." + } + ], + "mode": "REPEATED", + "name": "nlp_pubs_by_year", + "type": "RECORD", + "description": "Counts of natural language processing publications by the company by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year published." + }, + { + "mode": "NULLABLE", + "name": "robotics_pubs", + "type": "INTEGER", + "description": "Count of total robotics publications by the company in that year." + } + ], + "mode": "REPEATED", + "name": "robotics_pubs_by_year", + "type": "RECORD", + "description": "Counts of robotics publications by the company by year." + }, + { + "mode": "NULLABLE", + "name": "ai_pubs_in_top_conferences", + "type": "INTEGER", + "description": "Counts of total AI publications by the company that were published in top AI conferences." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year published." + }, + { + "mode": "NULLABLE", + "name": "ai_pubs_in_top_conferences", + "type": "INTEGER", + "description": "Count of total AI publications by the company that were published in top AI conferences in that year." + } + ], + "mode": "REPEATED", + "name": "ai_pubs_in_top_conferences_by_year", + "type": "RECORD", + "description": "Counts of AI publications in top conferences by the company by year." 
+ }, + { + "mode": "NULLABLE", + "name": "all_pubs", + "type": "INTEGER", + "description": "Count of total publications by the company." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year published." + }, + { + "mode": "NULLABLE", + "name": "all_pubs", + "type": "INTEGER", + "description": "Counts of total publications by the company in that year." + } + ], + "mode": "REPEATED", + "name": "all_pubs_by_year", + "type": "RECORD", + "description": "Counts of publications by the company by year." + }, + { + "mode": "NULLABLE", + "name": "short_description", + "type": "STRING", + "description": "Short description of the company, as drawn from Crunchbase's free interface." + }, + { + "mode": "NULLABLE", + "name": "logo_url", + "type": "STRING", + "description": "URL linking to a picture of the logo of the company, as drawn from Crunchbase's free interface." + }, + { + "mode": "NULLABLE", + "name": "stage", + "type": "STRING", + "description": "Maturity stage of a company." + } +] \ No newline at end of file diff --git a/company_linkage/schemas/workforce_visualization_data.json b/company_linkage/schemas/workforce_visualization_data.json new file mode 100644 index 00000000..f90a8022 --- /dev/null +++ b/company_linkage/schemas/workforce_visualization_data.json @@ -0,0 +1,20 @@ +[ + { + "mode": "NULLABLE", + "name": "cset_id", + "type": "INTEGER", + "description": "CSET id of PARAT company." + }, + { + "mode": "NULLABLE", + "name": "tt1_jobs", + "type": "INTEGER", + "description": "AI jobs as defined under CSET's technical track 1 definition." + }, + { + "mode": "NULLABLE", + "name": "ai_jobs", + "type": "INTEGER", + "description": "AI jobs as defined under a narrower definition within CSET's technical track 1 definition, focused specifically on research and implementation jobs within technical track 1." 
+ } +] \ No newline at end of file diff --git a/company_linkage/sql/initial_paper_visualization_data.sql b/company_linkage/sql/initial_paper_visualization_data.sql index 57d83813..09c085b2 100644 --- a/company_linkage/sql/initial_paper_visualization_data.sql +++ b/company_linkage/sql/initial_paper_visualization_data.sql @@ -33,8 +33,9 @@ WITH add_year GROUP BY CSET_id, - year) -SELECT + year), +all_cited as +(SELECT CSET_id, ARRAY_AGG(STRUCT(year, citation_count) @@ -43,6 +44,15 @@ SELECT FROM by_year GROUP BY - CSET_id + CSET_id) +SELECT + CSET_id, + citation_count_by_year +FROM + high_resolution_entities.aggregated_organizations +LEFT JOIN + all_cited +USING + (CSET_id) ORDER BY CSET_id \ No newline at end of file diff --git a/company_linkage/sql/initial_workforce_visualization_data.sql b/company_linkage/sql/initial_workforce_visualization_data.sql index 1c0f9be9..f6cdbec3 100644 --- a/company_linkage/sql/initial_workforce_visualization_data.sql +++ b/company_linkage/sql/initial_workforce_visualization_data.sql @@ -7,8 +7,9 @@ WITH FROM high_resolution_entities.aggregated_organizations CROSS JOIN - UNNEST (linkedin) AS linkedins) -SELECT + UNNEST (linkedin) AS linkedins), +job_info as +(SELECT DISTINCT cset_id, COUNT(DISTINCT user_id) AS tt1_jobs FROM @@ -41,6 +42,16 @@ WHERE OR ((degree = "Doctor") AND REGEXP_CONTAINS(field_raw, r'(?i)(computer\s+science|computer\s+engineering|electrical\s+engineering)'))) GROUP BY - cset_id + cset_id) +SELECT + DISTINCT + cset_id, + COALESCE(tt1_jobs, 0) as tt1_jobs +FROM + high_resolution_entities.aggregated_organizations +LEFT JOIN + job_info +USING + (cset_id) ORDER BY cset_id \ No newline at end of file diff --git a/company_linkage/sql/paper_visualization_data.sql b/company_linkage/sql/paper_visualization_data.sql index 122b58a8..3f24c27a 100644 --- a/company_linkage/sql/paper_visualization_data.sql +++ b/company_linkage/sql/paper_visualization_data.sql @@ -4,11 +4,11 @@ WITH SELECT CSET_id FROM - 
staging_ai_companies_visualization.visualization_data_omit_by_year + staging_ai_companies_visualization.visualization_data_omit_by_rule RIGHT JOIN staging_ai_companies_visualization.paper_visualization_data_with_methods USING (cset_id) - WHERE visualization_data_omit_by_year.cset_id IS NULL) + WHERE visualization_data_omit_by_rule.cset_id IS NULL) SELECT * FROM diff --git a/company_linkage/sql/paper_visualization_data_with_methods.sql b/company_linkage/sql/paper_visualization_data_with_methods.sql index 991a122e..e561d138 100644 --- a/company_linkage/sql/paper_visualization_data_with_methods.sql +++ b/company_linkage/sql/paper_visualization_data_with_methods.sql @@ -13,7 +13,7 @@ WITH merged_id, referent FROM - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers LEFT JOIN articles_with_ai_methods USING @@ -45,7 +45,7 @@ SELECT paper_visualization_data_with_tasks.*, methods FROM - ai_companies_visualization.paper_visualization_data_with_tasks + staging_ai_companies_visualization.paper_visualization_data_with_tasks LEFT JOIN aggregated_fields USING diff --git a/company_linkage/sql/paper_visualization_data_with_tasks.sql b/company_linkage/sql/paper_visualization_data_with_tasks.sql index b2c6b157..eed2f588 100644 --- a/company_linkage/sql/paper_visualization_data_with_tasks.sql +++ b/company_linkage/sql/paper_visualization_data_with_tasks.sql @@ -13,7 +13,7 @@ WITH merged_id, referent FROM - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers LEFT JOIN articles_with_ai_tasks USING @@ -45,7 +45,7 @@ SELECT paper_visualization_data_with_company_references.*, tasks FROM - ai_companies_visualization.paper_visualization_data_with_company_references + staging_ai_companies_visualization.paper_visualization_data_with_company_references LEFT JOIN aggregated_fields USING diff --git a/company_linkage/sql/workforce_visualization_data_with_ai_jobs.sql 
b/company_linkage/sql/workforce_visualization_data_with_ai_jobs.sql index 3f0c174c..a6981174 100644 --- a/company_linkage/sql/workforce_visualization_data_with_ai_jobs.sql +++ b/company_linkage/sql/workforce_visualization_data_with_ai_jobs.sql @@ -1,10 +1,9 @@ -create or replace table ai_companies_visualization.workforce_visualization_data as WITH clean_linkedins AS ( SELECT DISTINCT cset_id, name, - REPLACE(linkedins, "https://www.", "http://") AS linkedin + REPLACE(REPLACE(linkedins, "https://www.", ""), "http://www.", "") AS linkedin FROM high_resolution_entities.aggregated_organizations CROSS JOIN @@ -12,15 +11,15 @@ WITH new_ai_jobs AS ( SELECT DISTINCT cset_id, - COUNT(DISTINCT user_id) AS ai_jobs + COUNT(DISTINCT individual_position.user_id) AS ai_jobs FROM clean_linkedins INNER JOIN - `gcp-cset-projects.gcp_cset_revelio.position` position + revelio.individual_position ON - linkedin = company_li_url + linkedin = company_linkedin_url INNER JOIN - gcp_cset_revelio.role_lookup + revelio.role_lookup USING (mapped_role) INNER JOIN @@ -28,12 +27,16 @@ WITH ON (k1000 = role_k1000) LEFT JOIN - gcp_cset_revelio.education + revelio.individual_education USING (user_id) + LEFT JOIN + revelio.individual_position_descriptions + USING + (position_id) WHERE - (position.enddate IS NULL - OR position.enddate > CURRENT_DATE()) + (individual_position.enddate IS NULL + OR individual_position.enddate > CURRENT_DATE()) AND (ba_req IS FALSE OR ((degree = "Bachelor" OR degree = "Master" @@ -52,7 +55,7 @@ SELECT tt1_jobs, COALESCE(ai_jobs, 0) as ai_jobs FROM - ai_companies_visualization.workforce_visualization_data + staging_ai_companies_visualization.initial_workforce_visualization_data LEFT JOIN new_ai_jobs USING From 38cedbb6664a4a84e73cb391ce1a2b355f8fb362 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Fri, 12 Jan 2024 10:20:54 -0500 Subject: [PATCH 16/17] Clean things up, remove imports, edit node pools, and fix comments --- company_linkage/parat_data_dag.py | 17 +++++------------ 
.../sql/paper_visualization_data.sql | 5 +++++ .../sql/patent_visualization_data.sql | 5 +++++ .../sql/workforce_visualization_data.sql | 5 +++++ 4 files changed, 20 insertions(+), 12 deletions(-) diff --git a/company_linkage/parat_data_dag.py b/company_linkage/parat_data_dag.py index c3d3d3e5..4cfcd805 100644 --- a/company_linkage/parat_data_dag.py +++ b/company_linkage/parat_data_dag.py @@ -3,19 +3,12 @@ from airflow import DAG from airflow.operators.python import PythonOperator -from airflow.operators.trigger_dagrun import TriggerDagRunOperator from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator, BigQueryCheckOperator -from airflow.providers.google.cloud.operators.cloud_sql import ( - CloudSQLImportInstanceOperator, -) from airflow.providers.google.cloud.transfers.bigquery_to_bigquery import BigQueryToBigQueryOperator from airflow.providers.google.cloud.operators.kubernetes_engine import GKEStartPodOperator from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator from airflow.operators.dummy import DummyOperator from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator -from airflow.providers.google.cloud.transfers.bigquery_to_gcs import ( - BigQueryToGCSOperator, -) from dataloader.airflow_utils.defaults import ( DATA_BUCKET, PROJECT_ID, @@ -72,7 +65,6 @@ join_tables = [] for table in ["alias", "grid", "ids", "linkedin", "market", "organizations", "parent", "permid"]: - # Grab all the data and write it to unseen_en_corpus join_table = BigQueryInsertJobOperator( task_id=f"join_{table}", configuration={ @@ -105,12 +97,13 @@ curr = start_initial_tables for line in open(seq_path_prefix + initial_query_sequence).readlines(): dataset, table = line.split(",") - table_name = f"{dataset}.{table.strip()}" + table = table.strip() + table_name = f"{dataset}.{table}" next_tab = BigQueryInsertJobOperator( task_id=f"create_{table_name}", configuration={ "query": { - "query": "{% 
include '" + f"{sql_dir}/{table.strip()}.sql" + "' %}", + "query": "{% include '" + f"{sql_dir}/{table}.sql" + "' %}", "useLegacySql": False, "destinationTable": { "projectId": PROJECT_ID, @@ -155,7 +148,7 @@ task_id="run_get_ai_counts", project_id=PROJECT_ID, location=GCP_ZONE, - cluster_name="us-east1-production2023-cc1-01d75926-gke", + cluster_name="cc2-task-pool", name="run_get_ai_counts", cmds=["/bin/bash"], arguments=["-c", (f"echo 'getting AI counts!' ; rm -r ai || true ; " @@ -213,7 +206,7 @@ task_id=f"run_get_{paper_type}_counts", project_id=PROJECT_ID, location=GCP_ZONE, - cluster_name="us-east1-production2023-cc1-01d75926-gke", + cluster_name="cc2-task-pool", name=f"run_get_{paper_type}_counts", cmds=["/bin/bash"], arguments=["-c", (f"echo 'getting {paper_type} paper counts!' ; rm -r {paper_type} || true ; " diff --git a/company_linkage/sql/paper_visualization_data.sql b/company_linkage/sql/paper_visualization_data.sql index 3f24c27a..d71afe2e 100644 --- a/company_linkage/sql/paper_visualization_data.sql +++ b/company_linkage/sql/paper_visualization_data.sql @@ -1,4 +1,9 @@ -- Selecting the companies we want to leave out + -- Essentially, visualization_data_omit_by_rule contains all the companies that we want + -- to retain after the omit_by_rule process has been applied + -- So, here, in to_omit, we select any company that isn't found in that table as a + -- company we'd like to omit, replicating the rule-based omission. + -- This allows us to omit the same set of companies across all of our tables. 
WITH to_omit AS ( SELECT diff --git a/company_linkage/sql/patent_visualization_data.sql b/company_linkage/sql/patent_visualization_data.sql index 29818a23..8047506a 100644 --- a/company_linkage/sql/patent_visualization_data.sql +++ b/company_linkage/sql/patent_visualization_data.sql @@ -1,4 +1,9 @@ -- Selecting the companies we want to leave out + -- Essentially, visualization_data_omit_by_rule contains all the companies that we want + -- to retain after the omit_by_rule process has been applied + -- So, here, in to_omit, we select any company that isn't found in that table as a + -- company we'd like to omit, replicating the rule-based omission. + -- This allows us to omit the same set of companies across all of our tables. WITH to_omit AS ( SELECT diff --git a/company_linkage/sql/workforce_visualization_data.sql b/company_linkage/sql/workforce_visualization_data.sql index 3779e5e3..e5cc4f8e 100644 --- a/company_linkage/sql/workforce_visualization_data.sql +++ b/company_linkage/sql/workforce_visualization_data.sql @@ -1,4 +1,9 @@ -- Selecting the companies we want to leave out + -- Essentially, visualization_data_omit_by_rule contains all the companies that we want + -- to retain after the omit_by_rule process has been applied + -- So, here, in to_omit, we select any company that isn't found in that table as a + -- company we'd like to omit, replicating the rule-based omission. + -- This allows us to omit the same set of companies across all of our tables. 
WITH to_omit AS ( SELECT From a6c2d223a46fde55cdbbf0d5c493eda0d6a535d4 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Wed, 17 Jan 2024 12:17:06 -0500 Subject: [PATCH 17/17] Fix ai papers table to use merged id countries not ror countries --- company_linkage/sql/ai_publications.sql | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/company_linkage/sql/ai_publications.sql b/company_linkage/sql/ai_publications.sql index 0a000a1d..4c87164a 100644 --- a/company_linkage/sql/ai_publications.sql +++ b/company_linkage/sql/ai_publications.sql @@ -13,10 +13,13 @@ WITH -- Adding in org names and country data using ROR SELECT id, - name AS org_name, - country.country_name AS country + ror.name AS org_name, + standard_name AS country FROM - gcp_cset_ror.ror), + gcp_cset_ror.ror + LEFT JOIN + countries.country_code + ON lower(country.country_code) = lower(country_code.raw_alpha_2)), merged_rors AS ( -- Selecting all the merged ids and ror ids from the literature table SELECT