From ee6436e68d8aad81e1598fdb42d6638abeaa2947 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Thu, 28 Sep 2023 12:16:53 -0400 Subject: [PATCH 01/17] Create initial segment of DAG, through python code --- company_linkage/Dockerfile | 22 ++ company_linkage/README.md | 72 +++-- company_linkage/parat_data_dag.py | 294 ++++++++++++++++++ .../aggregate_organizations.py | 23 +- .../{ => parat_scripts}/all_papers.py | 0 .../deduplicate_companies.py | 0 .../{ => parat_scripts}/get_ai_counts.py | 0 .../test_aggregate_organizations.py | 12 +- .../{ => parat_scripts}/test_ai_counts.py | 2 +- .../{ => parat_scripts}/top_papers.py | 2 +- company_linkage/push_to_airflow.sh | 12 +- company_linkage/requirements.txt | 56 ++++ ...ema.json => aggregated_organizations.json} | 0 company_linkage/sequences.txt | 0 company_linkage/sequences/initial_data.csv | 6 + ...i_publications.sql => ai_publications.sql} | 2 - ..._publications.sql => all_publications.sql} | 0 ...g_ai_patents.sql => linked_ai_patents.sql} | 1 - ...airtable_imports.sql => organizations.sql} | 2 - ...rences.sql => pubs_in_top_conferences.sql} | 2 - ...rence_pubs.sql => top_conference_pubs.sql} | 2 - 21 files changed, 450 insertions(+), 60 deletions(-) create mode 100644 company_linkage/Dockerfile create mode 100644 company_linkage/parat_data_dag.py rename company_linkage/{ => parat_scripts}/aggregate_organizations.py (98%) rename company_linkage/{ => parat_scripts}/all_papers.py (100%) rename company_linkage/{ => parat_scripts}/deduplicate_companies.py (100%) rename company_linkage/{ => parat_scripts}/get_ai_counts.py (100%) rename company_linkage/{ => parat_scripts}/test_aggregate_organizations.py (97%) rename company_linkage/{ => parat_scripts}/test_ai_counts.py (98%) rename company_linkage/{ => parat_scripts}/top_papers.py (94%) mode change 100644 => 100755 company_linkage/push_to_airflow.sh create mode 100644 company_linkage/requirements.txt rename company_linkage/schemas/{aggregated_organizations_schema.json => 
aggregated_organizations.json} (100%) delete mode 100644 company_linkage/sequences.txt create mode 100644 company_linkage/sequences/initial_data.csv rename company_linkage/sql/{selecting_ai_publications.sql => ai_publications.sql} (95%) rename company_linkage/sql/{selecting_all_publications.sql => all_publications.sql} (100%) rename company_linkage/sql/{selecting_ai_patents.sql => linked_ai_patents.sql} (98%) rename company_linkage/sql/{create_organizations_from_airtable_imports.sql => organizations.sql} (97%) rename company_linkage/sql/{pulling_publications_in_top_ai_conferences.sql => pubs_in_top_conferences.sql} (93%) rename company_linkage/sql/{selecting_top_conference_pubs.sql => top_conference_pubs.sql} (97%) diff --git a/company_linkage/Dockerfile b/company_linkage/Dockerfile new file mode 100644 index 00000000..fc5ed907 --- /dev/null +++ b/company_linkage/Dockerfile @@ -0,0 +1,22 @@ +FROM ubuntu:20.04 + +# Set up system dependencies +RUN apt -y update +RUN apt-get -y update +RUN apt-get install -y build-essential libssl-dev libffi-dev python3-dev python3-pip curl + +# Grab files we need to run +ADD requirements.txt /parat/requirements.txt +ADD parat_scripts/* /parat/ + +# install gsutil and put it on the path for airflow to use +ENV CLOUDSDK_INSTALL_DIR /usr/local/gcloud/ +RUN curl -sSL https://sdk.cloud.google.com | bash +ENV PATH $PATH:/usr/local/gcloud/google-cloud-sdk/bin + +# Install python dependencies +WORKDIR /parat +ENV AIRFLOW_GPL_UNIDECODE=yes +RUN pip3 install -r requirements.txt +# Make sure the above config succeeded +RUN python3 -m pytest test_aggregate_organizations.py -k test_add_location \ No newline at end of file diff --git a/company_linkage/README.md b/company_linkage/README.md index 9dcdb1b3..47438633 100644 --- a/company_linkage/README.md +++ b/company_linkage/README.md @@ -16,37 +16,47 @@ run some of this code as-is. ## Tasks to build visualization data -1. 
[creating_organizations_from_airtable_imports.sql](sql/create_organizations_from_airtable_imports.sql) -2. [selecting_ai_publications.sql](sql/selecting_ai_publications.sql) -3. `python3 aggregate_organizations.py aggregated_organizations.jsonl` -4. Replace `high_resolution_entities.aggregated_organizations` with the data from `aggregated_organizations.jsonl` using the [aggregated_organizations_schema](schemas/aggregated_organizations_schema.json) -5. [selecting_ai_patents.sql](sql/selecting_ai_patents.sql) -6. `python3 get_ai_counts.py data/ai_company_papers.jsonl data/ai_company_patents.jsonl` -7. Upload `ai_company_papers.jsonl` to `ai_companies_visualization.ai_company_pubs` using the [ai_papers_schema](schemas/ai_papers_schema.json) -8. Upload `ai_company_patents.jsonl` to `ai_companies_visualization.ai_company_patents` using the [ai_patents_schema](schemas/ai_patents_schema.json) -9. [creating_initial_visualization_data_publications.sql](sql/creating_initial_visualization_data_publications.sql) -10. [adding_ai_pubs_by_year_to_visualization.sql](sql/adding_ai_pubs_by_year_to_visualization.sql) -11. [creating_patent_visualization_data.sql](sql/creating_patent_visualization_data.sql) -12. [adding_ai_patents_by_year_to_visualization.sql](sql/adding_ai_patents_by_year_to_visualization.sql) -13. [creating_paper_visualization_data.sql](sql/creating_paper_visualization_data.sql) -14. [adding_top_mag_ai_fields.sql](sql/adding_top_mag_ai_fields.sql) -15. [adding_top_science_map_clusters.sql](sql/adding_top_science_map_clusters.sql) -16. [adding_company_references.sql](sql/adding_company_references.sql) -17. [adding_top_tasks.sql](sql/adding_top_tasks.sql) -18. [adding_top_methods.sql](sql/adding_top_methods.sql) -19. [selecting_top_conference_pubs.sql](sql/selecting_top_conference_pubs.sql) -20. [pulling_publications_in_top_ai_conferences.sql](sql/pulling_publications_in_top_ai_conferences.sql) -21. `python3 top_papers.py top_paper_counts.jsonl` -22. 
Upload `top_paper_counts.jsonl` to `ai_companies_visualization.top_paper_counts` using the [top_papers_schema](schemas/top_papers_schema.json) -23. [adding_top_paper_counts.sql](sql/adding_top_paper_counts.sql) -24. [selecting_all_publications.sql](sql/selecting_all_publications.sql) -25. `python3 all_papers.py all_paper_counts.jsonl` -26. Upload `all_paper_counts.jsonl` to `ai_companies_visualization.total_paper_counts` using the [all_papers_schema](schemas/all_papers_schema.json) +1. [organizations.sql](sql/organizations.sql) +2. [ai_publications.sql](sql/ai_publications.sql) +3. [linked_ai_patents.sql](sql/linked_ai_patents.sql) +4. [top_conference_pubs.sql](sql/top_conference_pubs.sql) +5. [pubs_in_top_conferences.sql](sql/pubs_in_top_conferences.sql) +6. [all_publications.sql](sql/all_publications.sql) +7. `python3 aggregate_organizations.py aggregated_organizations.jsonl` +8. Replace `high_resolution_entities.aggregated_organizations` with the data from `aggregated_organizations.jsonl` using the [aggregated_organizations_schema](schemas/aggregated_organizations_schema.json) +9. `python3 get_ai_counts.py data/ai_company_papers.jsonl data/ai_company_patents.jsonl` +10. Upload `ai_company_papers.jsonl` to `ai_companies_visualization.ai_company_pubs` using the [ai_papers_schema](schemas/ai_papers_schema.json) +11. Upload `ai_company_patents.jsonl` to `ai_companies_visualization.ai_company_patents` using the [ai_patents_schema](schemas/ai_patents_schema.json) +12. `python3 top_papers.py top_paper_counts.jsonl` +13. Upload `top_paper_counts.jsonl` to `ai_companies_visualization.top_paper_counts` using the [top_papers_schema](schemas/top_papers_schema.json) +14. `python3 all_papers.py all_paper_counts.jsonl` +15. Upload `all_paper_counts.jsonl` to `ai_companies_visualization.total_paper_counts` using the [all_papers_schema](schemas/all_papers_schema.json) +16. 
[creating_initial_visualization_data_publications.sql](sql/creating_initial_visualization_data_publications.sql) +17. [adding_ai_pubs_by_year_to_visualization.sql](sql/adding_ai_pubs_by_year_to_visualization.sql) +18. [creating_patent_visualization_data.sql](sql/creating_patent_visualization_data.sql) +19. [adding_ai_patents_by_year_to_visualization.sql](sql/adding_ai_patents_by_year_to_visualization.sql) +20. [creating_paper_visualization_data.sql](sql/creating_paper_visualization_data.sql) +21. [adding_top_mag_ai_fields.sql](sql/adding_top_mag_ai_fields.sql) +22. [adding_top_science_map_clusters.sql](sql/adding_top_science_map_clusters.sql) +23. [adding_company_references.sql](sql/adding_company_references.sql) +24. [adding_top_tasks.sql](sql/adding_top_tasks.sql) +25. [adding_top_methods.sql](sql/adding_top_methods.sql) +26. [adding_top_paper_counts.sql](sql/adding_top_paper_counts.sql) 27. [adding_all_paper_counts.sql](sql/adding_all_paper_counts.sql) 28. [creating_workforce_visualization_data.sql](sql/creating_workforce_visualization_data.sql) 29. [adding_ai_jobs_to_workforce_visualization.sql](sql/adding_ai_jobs_to_workforce_visualization.sql) -31. [omit_by_rule.sql](sql/omit_by_rule.sql) -32. [omit_by_rule_papers.sql](sql/omit_by_rule_papers.sql) -33. [omit_by_rule_patents.sql](sql/omit_by_rule_patents.sql) -34. [omit_by_rule_workforce.sql](sql/omit_by_rule_workforce.sql) -35. [adding_crunchbase_company_metadata.sql](sql/adding_crunchbase_company_metadata.sql) \ No newline at end of file +30. [omit_by_rule.sql](sql/omit_by_rule.sql) +31. [omit_by_rule_papers.sql](sql/omit_by_rule_papers.sql) +32. [omit_by_rule_patents.sql](sql/omit_by_rule_patents.sql) +33. [omit_by_rule_workforce.sql](sql/omit_by_rule_workforce.sql) +34. 
[adding_crunchbase_company_metadata.sql](sql/adding_crunchbase_company_metadata.sql) + +# Deployment + +To refresh the docker container (which you must do if you change any of the python scripts in parat_scripts/), run + +``` +docker build -t parat . +docker tag parat us.gcr.io/gcp-cset-projects/parat +docker push us.gcr.io/gcp-cset-projects/parat +``` \ No newline at end of file diff --git a/company_linkage/parat_data_dag.py b/company_linkage/parat_data_dag.py new file mode 100644 index 00000000..e64c2fca --- /dev/null +++ b/company_linkage/parat_data_dag.py @@ -0,0 +1,294 @@ +import os +from datetime import datetime + +from airflow import DAG +from airflow.operators.python import PythonOperator +from airflow.operators.trigger_dagrun import TriggerDagRunOperator +from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator +from airflow.providers.google.cloud.operators.cloud_sql import ( + CloudSQLImportInstanceOperator, +) +from airflow.providers.google.cloud.operators.kubernetes_engine import GKEStartPodOperator +from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator +from airflow.operators.dummy import DummyOperator +from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator +from airflow.providers.google.cloud.transfers.bigquery_to_gcs import ( + BigQueryToGCSOperator, +) +from dataloader.airflow_utils.defaults import ( + DATA_BUCKET, + PROJECT_ID, + GCP_ZONE, + get_default_args, + get_post_success, +) +from dataloader.scripts.populate_documentation import update_table_descriptions +from parat_scripts.aggregate_organizations import aggregate_organizations + +bucket = DATA_BUCKET +initial_dataset = "parat_input" +intermediate_dataset = "high_resolution_entities" +production_dataset = "ai_companies_visualization" +staging_intermediate_dataset = f"staging_{intermediate_dataset}" +staging_dataset = f"staging_{production_dataset}" +sql_dir = "sql/parat" +schema_dir = 
"schemas/parat" +tmp_dir = f"{production_dataset}/tmp" + +default_args = get_default_args() +date = datetime.now().strftime("%Y%m%d") + + +# Part 2: Get data from airtable and update databases +dag = DAG( + "parat", + default_args=default_args, + description="PARAT data updater", + schedule_interval=None, + catchup=False, + user_defined_macros={ + "staging_dataset": staging_dataset, + "production_dataset": production_dataset, + "staging_intermediate_dataset": staging_intermediate_dataset, + "intermediate_dataset": intermediate_dataset, + "initial_dataset": initial_dataset + }, +) +with dag: + + clear_tmp_dir = GCSDeleteObjectsOperator( + task_id="clear_tmp_dir", + bucket_name=DATA_BUCKET, + prefix=tmp_dir + ) + + # combine all the airtable tables into joined tables + + start = DummyOperator(task_id="starting") + + join_tables = [] + for table in ["alias", "grid", "ids", "linkedin", "market", "organizations", "parent", "permid"]: + + # Grab all the data and write it to unseen_en_corpus + join_table = BigQueryInsertJobOperator( + task_id=f"join_{table}", + configuration={ + "query": { + "query": f"select distinct * from {initial_dataset}.{table}_preannotation UNION DISTINCT " + f"select distinct * from {initial_dataset}.{table}_validate", + "useLegacySql": False, + "destinationTable": { + "projectId": PROJECT_ID, + "datasetId": initial_dataset, + "tableId": f"{table}_joined" + }, + "allowLargeResults": True, + "createDisposition": "CREATE_IF_NEEDED", + "writeDisposition": "WRITE_TRUNCATE" + } + } + ) + join_tables.append(join_table) + + # Do initial query sequence + + start_initial_tables = DummyOperator(task_id="start_initial_tables") + + wait_for_initial_tables = DummyOperator(task_id="wait_for_initial_tables") + + seq_path_prefix = f"{os.environ.get('DAGS_FOLDER')}/sequences/parat/" + initial_query_sequence = "initial_data.csv" + + curr = start_initial_tables + for line in open(seq_path_prefix + initial_query_sequence).readlines(): + dataset, table = 
line.split(",") + staging_table_name = f"staging_{dataset}.{table.strip()}" + next = BigQueryInsertJobOperator( + task_id="create_"+staging_table_name, + configuration={ + "query": { + "query": "{% include '" + f"{sql_dir}/{table.strip()}.sql" + "' %}", + "useLegacySql": False, + "destinationTable": { + "projectId": PROJECT_ID, + "datasetId": staging_dataset, + "tableId": table + }, + "allowLargeResults": True, + "createDisposition": "CREATE_IF_NEEDED", + "writeDisposition": "WRITE_TRUNCATE" + } + }, + ) + curr >> next + curr = next + curr >> wait_for_initial_tables + + # run aggregate_organizations python and load to GCS + aggregated_table = "aggregated_organizations" + + aggregate_organizations = PythonOperator( + task_id="aggregate_organizations", + op_kwargs={ + "output_file": f"{aggregated_table}.jsonl" + }, + python_callable=aggregate_organizations, + ) + + # load aggregated_organizations to BigQuery + + load_aggregated_orgs = GCSToBigQueryOperator( + task_id=f"load_{aggregated_table}", + bucket=DATA_BUCKET, + source_objects=[f"{aggregated_table}.jsonl"], + schema_object=f"{schema_dir}/{aggregated_table}.json", + destination_project_dataset_table=f"{staging_intermediate_dataset}.{aggregated_table}", + source_format="NEWLINE_DELIMITED_JSON", + create_disposition="CREATE_IF_NEEDED", + write_disposition="WRITE_TRUNCATE" + ) + + # TODO: somewhere in here we need to decide whether to load directly to the main table + # or to add a transfer step to transfer from staging to the main table; if the latter + # are there checks we want to add first? + # for now, pretend the data is in the main table already + + run_get_ai_counts = GKEStartPodOperator( + task_id="run_get_ai_counts", + project_id=PROJECT_ID, + location=GCP_ZONE, + cluster_name="us-east1-production2023-cc1-01d75926-gke", + name="run_get_ai_counts", + cmds=["/bin/bash"], + arguments=["-c", (f"echo 'getting AI counts!' 
; rm -r ai || true ; " + f"mkdir -p ai && " + f"python3 get_ai_counts.py ai/ai_company_papers.jsonl ai/ai_company_patents.jsonl && " + f"gsutil -m cp -r ai gs://{DATA_BUCKET}/{tmp_dir}/ ")], + namespace="default", + image=f"us.gcr.io/{PROJECT_ID}/parat", + get_logs=True, + startup_timeout_seconds=300, + # see also https://cloud.google.com/composer/docs/how-to/using/using-kubernetes-pod-operator#affinity-config + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [{ + "matchExpressions": [{ + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": [ + "default-pool", + ] + }] + }] + } + } + } + ) + + load_ai_papers = GCSToBigQueryOperator( + task_id=f"load_ai_company_papers", + bucket=DATA_BUCKET, + source_objects=[f"{tmp_dir}/ai/ai_company_papers.jsonl"], + schema_object=f"{schema_dir}/ai_papers_schema.json", + destination_project_dataset_table=f"{staging_intermediate_dataset}.ai_company_papers", + source_format="NEWLINE_DELIMITED_JSON", + create_disposition="CREATE_IF_NEEDED", + write_disposition="WRITE_TRUNCATE" + ) + + load_ai_patents = GCSToBigQueryOperator( + task_id=f"load_ai_company_patents", + bucket=DATA_BUCKET, + source_objects=[f"{tmp_dir}/ai/ai_company_patents.jsonl"], + schema_object=f"{schema_dir}/ai_patents_schema.json", + destination_project_dataset_table=f"{staging_intermediate_dataset}.ai_company_patents", + source_format="NEWLINE_DELIMITED_JSON", + create_disposition="CREATE_IF_NEEDED", + write_disposition="WRITE_TRUNCATE" + ) + + run_papers = [] + for paper_type in ["top", "all"]: + + run_get_paper_counts = GKEStartPodOperator( + task_id=f"run_get_{paper_type}_counts", + project_id=PROJECT_ID, + location=GCP_ZONE, + cluster_name="us-east1-production2023-cc1-01d75926-gke", + name=f"run_get_{paper_type}_counts", + cmds=["/bin/bash"], + arguments=["-c", (f"echo 'getting {paper_type} paper counts!' 
; rm -r {paper_type} || true ; " + f"mkdir -p {paper_type} && " + f"python3 {paper_type}_papers.py {paper_type}/{paper_type}_paper_counts.jsonl && " + f"gsutil -m cp -r {paper_type} gs://{DATA_BUCKET}/{tmp_dir}/ ")], + namespace="default", + image=f"us.gcr.io/{PROJECT_ID}/parat", + get_logs=True, + startup_timeout_seconds=300, + # see also https://cloud.google.com/composer/docs/how-to/using/using-kubernetes-pod-operator#affinity-config + affinity={ + "nodeAffinity": { + "requiredDuringSchedulingIgnoredDuringExecution": { + "nodeSelectorTerms": [{ + "matchExpressions": [{ + "key": "cloud.google.com/gke-nodepool", + "operator": "In", + "values": [ + "default-pool", + ] + }] + }] + } + } + } + ) + run_papers.append(run_get_paper_counts) + + # even though these are near-identical we do these in sequence -- we'd have to put in a dummy operator + # otherwise anyway and they should be fast + + load_top_papers = GCSToBigQueryOperator( + task_id=f"load_top_papers", + bucket=DATA_BUCKET, + source_objects=[f"{tmp_dir}/top/top_paper_counts.jsonl"], + schema_object=f"{schema_dir}/top_papers_schema.json", + destination_project_dataset_table=f"{staging_intermediate_dataset}.top_paper_counts", + source_format="NEWLINE_DELIMITED_JSON", + create_disposition="CREATE_IF_NEEDED", + write_disposition="WRITE_TRUNCATE" + ) + + load_all_papers = GCSToBigQueryOperator( + task_id=f"load_all_papers", + bucket=DATA_BUCKET, + source_objects=[f"{tmp_dir}/all/all_paper_counts.jsonl"], + schema_object=f"{schema_dir}/all_papers_schema.json", + destination_project_dataset_table=f"{staging_intermediate_dataset}.all_paper_counts", + source_format="NEWLINE_DELIMITED_JSON", + create_disposition="CREATE_IF_NEEDED", + write_disposition="WRITE_TRUNCATE" + ) + + + + + + ( + clear_tmp_dir + >> start + >> join_tables + >> start_initial_tables + ) + ( + wait_for_initial_tables + >> aggregate_organizations + >> load_aggregated_orgs + >> run_get_ai_counts + >> load_ai_papers + >> load_ai_patents + >> run_papers + >> load_top_papers + >> 
load_all_papers + ) + diff --git a/company_linkage/aggregate_organizations.py b/company_linkage/parat_scripts/aggregate_organizations.py similarity index 98% rename from company_linkage/aggregate_organizations.py rename to company_linkage/parat_scripts/aggregate_organizations.py index 0029ed62..424ea404 100644 --- a/company_linkage/aggregate_organizations.py +++ b/company_linkage/parat_scripts/aggregate_organizations.py @@ -2,6 +2,7 @@ from google.cloud import bigquery import json from collections import defaultdict +import subprocess # List of companies not being aggregated # note: check https://docs.google.com/spreadsheets/d/1Tq28O8qIA6T3AJ5oTHKCcscaNZsY_E4OPOUm6JaiwWA/edit#gid=0 @@ -394,7 +395,7 @@ def update_organization_data(self, org, org_id): org_info.add_sandp(org["in_sandp_500"]) org_info.add_fortune(org["in_fortune_global_500"]) - def print_output(self, output_file): + def print_output(self, output_file, local): """ Writing the aggregated organization output to file :param output_file: The output file we're writing to @@ -414,18 +415,22 @@ def print_output(self, output_file): "non_agg_children": org_info.non_agg_children} out.write(json.dumps(js, ensure_ascii=False) + "\n") out.close() + if not local: + subprocess.run(["gsutil", "-m", "cp", "-r", output_file, "gs://parat/"], check=True) -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("output_file", type=str, help="A jsonl file for writing output data to create new tables") - args = parser.parse_args() - if not args.output_file.endswith(".jsonl"): - parser.print_help() + +def aggregate_organizations(output_file, local=False): aggregator = OrganizationAggregator() aggregator.get_parents() aggregator.get_organizations() - aggregator.print_output(args.output_file) + aggregator.print_output(output_file, local) if __name__ == "__main__": - main() + parser = argparse.ArgumentParser() + parser.add_argument("output_file", type=str, help="A jsonl file for writing output data to create new 
tables") + args = parser.parse_args() + if not args.output_file.endswith(".jsonl"): + parser.print_help() + aggregate_organizations(args.output_file, local=True) + diff --git a/company_linkage/all_papers.py b/company_linkage/parat_scripts/all_papers.py similarity index 100% rename from company_linkage/all_papers.py rename to company_linkage/parat_scripts/all_papers.py diff --git a/company_linkage/deduplicate_companies.py b/company_linkage/parat_scripts/deduplicate_companies.py similarity index 100% rename from company_linkage/deduplicate_companies.py rename to company_linkage/parat_scripts/deduplicate_companies.py diff --git a/company_linkage/get_ai_counts.py b/company_linkage/parat_scripts/get_ai_counts.py similarity index 100% rename from company_linkage/get_ai_counts.py rename to company_linkage/parat_scripts/get_ai_counts.py diff --git a/company_linkage/test_aggregate_organizations.py b/company_linkage/parat_scripts/test_aggregate_organizations.py similarity index 97% rename from company_linkage/test_aggregate_organizations.py rename to company_linkage/parat_scripts/test_aggregate_organizations.py index 76709ac6..c0287b21 100644 --- a/company_linkage/test_aggregate_organizations.py +++ b/company_linkage/parat_scripts/test_aggregate_organizations.py @@ -1,6 +1,6 @@ import os import unittest -from company_linkage import aggregate_organizations +import aggregate_organizations from collections import defaultdict @@ -131,15 +131,15 @@ def test_add_grid(self): def test_add_regex(self): org = aggregate_organizations.Organization(1, "test") - org.add_regex("^hhi\s+corporation$|^hhi$|^hhi\s+corp$") - self.assertEqual(org.regex[0], "^hhi\s+corporation$|^hhi$|^hhi\s+corp$") + org.add_regex(r"^hhi\s+corporation$|^hhi$|^hhi\s+corp$") + self.assertEqual(org.regex[0], r"^hhi\s+corporation$|^hhi$|^hhi\s+corp$") self.assertEqual(len(org.regex), 1) # Don't add a duplicate entry! 
- org.add_regex("^hhi\s+corporation$|^hhi$|^hhi\s+corp$") + org.add_regex(r"^hhi\s+corporation$|^hhi$|^hhi\s+corp$") self.assertEqual(len(org.regex), 1) # Do add a new one - org.add_regex("^hhi\s+corporation$") - self.assertEqual(org.regex[1], "^hhi\s+corporation$") + org.add_regex(r"^hhi\s+corporation$") + self.assertEqual(org.regex[1], r"^hhi\s+corporation$") self.assertEqual(len(org.regex), 2) def test_add_bgov_id(self): diff --git a/company_linkage/test_ai_counts.py b/company_linkage/parat_scripts/test_ai_counts.py similarity index 98% rename from company_linkage/test_ai_counts.py rename to company_linkage/parat_scripts/test_ai_counts.py index 83bf622b..3845ffff 100644 --- a/company_linkage/test_ai_counts.py +++ b/company_linkage/parat_scripts/test_ai_counts.py @@ -1,5 +1,5 @@ import unittest -from company_linkage.get_ai_counts import CountGetter +from get_ai_counts import CountGetter import warnings diff --git a/company_linkage/top_papers.py b/company_linkage/parat_scripts/top_papers.py similarity index 94% rename from company_linkage/top_papers.py rename to company_linkage/parat_scripts/top_papers.py index 962bcf97..a11da6af 100644 --- a/company_linkage/top_papers.py +++ b/company_linkage/parat_scripts/top_papers.py @@ -1,6 +1,6 @@ import argparse -from company_linkage.get_ai_counts import CountGetter +from get_ai_counts import CountGetter def main() -> None: diff --git a/company_linkage/push_to_airflow.sh b/company_linkage/push_to_airflow.sh old mode 100644 new mode 100755 index ca988b32..7d3813b7 --- a/company_linkage/push_to_airflow.sh +++ b/company_linkage/push_to_airflow.sh @@ -3,10 +3,16 @@ gsutil rm -r gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/airtable_to_bq_confi gsutil cp -r airtable_configs/parat_preannotation gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/airtable_to_bq_config/ gsutil cp -r airtable_configs/parat_validate gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/airtable_to_bq_config/ -gsutil cp airtable_queries/parat_preannotation/* 
gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/bq_to_airtable/parat_preannotation/ -gsutil cp airtable_queries/parat_preannotation/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/bq_to_airtable/parat_validate/ gsutil cp airtable_queries/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/airtable_to_bq/parat_preannotation/ gsutil cp airtable_queries/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/airtable_to_bq/parat_validate/ gsutil cp airtable_schemas/parat_preannotation/* gs://airflow-data-exchange-development/schemas/airtable_to_bq/parat_preannotation/ -gsutil cp airtable_schemas/parat_validate/* gs://airflow-data-exchange-development/schemas/airtable_to_bq/parat_validate/ \ No newline at end of file +gsutil cp airtable_schemas/parat_validate/* gs://airflow-data-exchange-development/schemas/airtable_to_bq/parat_validate/ + +gsutil cp parat_data_dag.py gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/ +gsutil cp aggregate_organizations.py gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/ +gsutil cp sequences/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sequences/parat/ +gsutil rm gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/parat/* +gsutil cp sql/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/parat/ +gsutil cp schemas/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/schemas/parat/ +gsutil -m cp -r parat_scripts/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/parat_scripts/ \ No newline at end of file diff --git a/company_linkage/requirements.txt b/company_linkage/requirements.txt new file mode 100644 index 00000000..10b51281 --- /dev/null +++ b/company_linkage/requirements.txt @@ -0,0 +1,56 @@ +attrs==21.2.0 +cachetools==4.1.1 +certifi==2020.6.20 +cffi==1.14.3 +chardet==3.0.4 +coverage==5.5 +google-api-core==1.30.0 +google-auth==1.30.2 +google-auth-oauthlib==0.4.4 +google-cloud-bigquery==2.20.0 +google-cloud-bigquery-storage==2.4.0 +google-cloud-core==1.6.0 +google-cloud-translate==3.2.0 +google-crc32c==1.1.2 
+google-resumable-media==1.3.0 +googleapis-common-protos==1.53.0 +grpcio==1.33.1 +idna==2.10 +iniconfig==1.1.1 +libcst==0.3.13 +mypy-extensions==0.4.3 +numpy==1.20.3 +oauthlib==3.1.0 +packaging==20.9 +pandas==1.1.3 +pandas-gbq==0.14.0 +Pillow==8.2.0 +pluggy==0.13.1 +pprintpp==0.4.0 +proto-plus==1.11.0 +protobuf==3.13.0 +py==1.10.0 +pyarrow==3.0.0 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pycld2==0.41 +pycountry==20.7.3 +pycountry-convert==0.7.2 +pycparser==2.20 +pydata-google-auth==1.1.0 +pyparsing==2.4.7 +pytest==6.2.4 +pytest-cov==2.12.1 +pytest-mock==3.6.1 +python-dateutil==2.8.1 +pytz==2020.1 +PyYAML==5.3.1 +repoze.lru==0.7 +requests==2.24.0 +requests-oauthlib==1.3.0 +rsa==4.6 +six==1.15.0 +toml==0.10.2 +typing-extensions==3.7.4.3 +typing-inspect==0.6.0 +urllib3==1.25.11 diff --git a/company_linkage/schemas/aggregated_organizations_schema.json b/company_linkage/schemas/aggregated_organizations.json similarity index 100% rename from company_linkage/schemas/aggregated_organizations_schema.json rename to company_linkage/schemas/aggregated_organizations.json diff --git a/company_linkage/sequences.txt b/company_linkage/sequences.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/company_linkage/sequences/initial_data.csv b/company_linkage/sequences/initial_data.csv new file mode 100644 index 00000000..a6e65b82 --- /dev/null +++ b/company_linkage/sequences/initial_data.csv @@ -0,0 +1,6 @@ +high_resolution_entities,organizations +ai_companies_visualization,ai_publications +ai_companies_visualization,linked_ai_patents +ai_companies_visualization,top_conference_pubs +ai_companies_visualization,pubs_in_top_conferences +ai_companies_visualization,all_publications \ No newline at end of file diff --git a/company_linkage/sql/selecting_ai_publications.sql b/company_linkage/sql/ai_publications.sql similarity index 95% rename from company_linkage/sql/selecting_ai_publications.sql rename to company_linkage/sql/ai_publications.sql index 6877d104..75a0b0f6 100644 --- 
a/company_linkage/sql/selecting_ai_publications.sql +++ b/company_linkage/sql/ai_publications.sql @@ -1,8 +1,6 @@ -- Pulling every AI-associated publication id linked to every grid id and every organization name -- We also include years because we'll want those later for yearly counts -- and cv/robotics/nlp so we can filter on these -CREATE OR REPLACE TABLE - ai_companies_visualization.ai_publications AS WITH ai_papers AS ( SELECT diff --git a/company_linkage/sql/selecting_all_publications.sql b/company_linkage/sql/all_publications.sql similarity index 100% rename from company_linkage/sql/selecting_all_publications.sql rename to company_linkage/sql/all_publications.sql diff --git a/company_linkage/sql/selecting_ai_patents.sql b/company_linkage/sql/linked_ai_patents.sql similarity index 98% rename from company_linkage/sql/selecting_ai_patents.sql rename to company_linkage/sql/linked_ai_patents.sql index e5db1fff..e667c310 100644 --- a/company_linkage/sql/selecting_ai_patents.sql +++ b/company_linkage/sql/linked_ai_patents.sql @@ -1,7 +1,6 @@ -- Pulling every AI-associated patent family id linked to every grid id of any assignee for that patent, and all the assignee names -- We also pull in the AI subcategories and the years -- We also attempt to add in "fake" families for the patents that are missing patent families -create or replace table ai_companies_visualization.linked_ai_patents as with patents_orig as ( SELECT -- Pulling in the current assignee grid ids from dimensions diff --git a/company_linkage/sql/create_organizations_from_airtable_imports.sql b/company_linkage/sql/organizations.sql similarity index 97% rename from company_linkage/sql/create_organizations_from_airtable_imports.sql rename to company_linkage/sql/organizations.sql index 0fa42f1d..60f875d5 100644 --- a/company_linkage/sql/create_organizations_from_airtable_imports.sql +++ b/company_linkage/sql/organizations.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE TABLE - high_resolution_entities.organizations 
AS SELECT * REPLACE( ( SELECT diff --git a/company_linkage/sql/pulling_publications_in_top_ai_conferences.sql b/company_linkage/sql/pubs_in_top_conferences.sql similarity index 93% rename from company_linkage/sql/pulling_publications_in_top_ai_conferences.sql rename to company_linkage/sql/pubs_in_top_conferences.sql index 08487408..cc1bee22 100644 --- a/company_linkage/sql/pulling_publications_in_top_ai_conferences.sql +++ b/company_linkage/sql/pubs_in_top_conferences.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.pubs_in_top_conferences AS WITH -- Associating GRIDs to the merged paper ids affils AS ( diff --git a/company_linkage/sql/selecting_top_conference_pubs.sql b/company_linkage/sql/top_conference_pubs.sql similarity index 97% rename from company_linkage/sql/selecting_top_conference_pubs.sql rename to company_linkage/sql/top_conference_pubs.sql index af563615..25993325 100644 --- a/company_linkage/sql/selecting_top_conference_pubs.sql +++ b/company_linkage/sql/top_conference_pubs.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.top_conference_pubs AS WITH venues AS ( SELECT From 5127270e49d92efae458c07ef7843c3519c2715b Mon Sep 17 00:00:00 2001 From: Rebecca Date: Thu, 28 Sep 2023 12:49:54 -0400 Subject: [PATCH 02/17] Update queries needed pre-python scripts for great update --- company_linkage/sql/ai_publications.sql | 32 +++++++++++------------ company_linkage/sql/linked_ai_patents.sql | 14 +++++----- company_linkage/sql/organizations.sql | 14 ++++++---- 3 files changed, 32 insertions(+), 28 deletions(-) diff --git a/company_linkage/sql/ai_publications.sql b/company_linkage/sql/ai_publications.sql index 75a0b0f6..2aacb37f 100644 --- a/company_linkage/sql/ai_publications.sql +++ b/company_linkage/sql/ai_publications.sql @@ -12,26 +12,26 @@ WITH gcp-cset-projects.article_classification.predictions WHERE ai_filtered = TRUE OR cv_filtered = TRUE OR nlp_filtered = TRUE OR robotics_filtered = TRUE), - gr AS ( - 
-- Adding in org names and country data using GRID + ror AS ( + -- Adding in org names and country data using ROR SELECT id, name AS org_name, - country_name AS country + country.country_name AS country FROM - gcp-cset-projects.gcp_cset_grid.api_grid), - merged_grids AS ( - -- Selecting all the merged ids and grid ids from the links table + gcp_cset_ror.ror), + merged_rors AS ( + -- Selecting all the merged ids and ror ids from the literature table SELECT DISTINCT merged_id, - grid_id, + ror_id, org_name, cv_filtered as cv, nlp_filtered as nlp, robotics_filtered as robotics FROM - `gcp-cset-projects.gcp_cset_links_v2.paper_affiliations_merged` + literature.affiliations -- if they're AI papers INNER JOIN ai_papers USING (merged_id)), @@ -40,20 +40,20 @@ WITH merged_id, year FROM - `gcp-cset-projects.gcp_cset_links_v2.corpus_merged`) + literature.papers) SELECT - -- Adding in the org name and country associated with the grid id - merged_grids.* EXCEPT (org_name), - COALESCE(gr.org_name, merged_grids.org_name) as org_name, + -- Adding in the org name and country associated with the ror id + merged_rors.* EXCEPT (org_name), + COALESCE(ror.org_name, merged_rors.org_name) as org_name, country, year FROM - merged_grids + merged_rors LEFT JOIN - gr + ror ON - merged_grids.Grid_ID = gr.id + merged_rors.ror_id = ror.id LEFT JOIN article_years ON - merged_grids.merged_id = article_years.merged_id \ No newline at end of file + merged_rors.merged_id = article_years.merged_id \ No newline at end of file diff --git a/company_linkage/sql/linked_ai_patents.sql b/company_linkage/sql/linked_ai_patents.sql index e667c310..ee7360b2 100644 --- a/company_linkage/sql/linked_ai_patents.sql +++ b/company_linkage/sql/linked_ai_patents.sql @@ -3,13 +3,13 @@ -- We also attempt to add in "fake" families for the patents that are missing patent families with patents_orig as ( SELECT - -- Pulling in the current assignee grid ids from dimensions + -- Pulling in the current assignee ror ids from 
dimensions patent_id, family_id, assignee, - grid + ror_id FROM - `gcp-cset-projects.unified_patents.normalized_patent_assignees`), + unified_patents.assignees_normalized), all_ai as ( -- Selecting all the family ids and patent IDs to get AI patents -- Also select the year so we can get counts by year @@ -50,13 +50,13 @@ all_ai as ( Machine_Learning, Search_Methods FROM - gcp-cset-projects.unified_patents.ai_patents), + unified_patents.ai_patents), patent_years as ( SELECT patent_id, EXTRACT(year FROM first_priority_date) as priority_year FROM - gcp-cset-projects.unified_patents.patent_dates + unified_patents.dates ) SELECT DISTINCT @@ -65,7 +65,7 @@ all_ai as ( -- We're just doing this so our counts aren't blank COALESCE(family_id, "X-" || patent_id) as family_id, assignee, - grid, + ror_id, MIN(priority_year) as priority_year, LOGICAL_OR(Physical_Sciences_and_Engineering) as Physical_Sciences_and_Engineering, LOGICAL_OR(Life_Sciences) as Life_Sciences, @@ -110,6 +110,6 @@ all_ai as ( USING (patent_id)) WHERE priority_year IS NOT NULL GROUP BY - grid, + ror_id, assignee, family_id \ No newline at end of file diff --git a/company_linkage/sql/organizations.sql b/company_linkage/sql/organizations.sql index 60f875d5..b93bf8b1 100644 --- a/company_linkage/sql/organizations.sql +++ b/company_linkage/sql/organizations.sql @@ -35,7 +35,7 @@ FROM ( organizations_joined.name, STRUCT(city, province_state, - country) AS location, + organizations_joined.country) AS location, website, ARRAY_AGG(STRUCT(alias_language, alias)) AS aliases, @@ -52,9 +52,9 @@ FROM ( ticker)) AS market, STRUCT(crunchbase_uuid, crunchbase_url) AS crunchbase, - ARRAY_AGG(DISTINCT grid IGNORE NULLS) AS grid, + ARRAY_AGG(DISTINCT ror.id IGNORE NULLS) AS ror_id, regex, - ARRAY_AGG(DISTINCT bgov IGNORE NULLS) AS BGOV_id, + ARRAY_AGG(DISTINCT bgov_id IGNORE NULLS) AS BGOV_id, linkedin, CASE WHEN in_sandp_500 IS TRUE THEN TRUE @@ -68,7 +68,7 @@ FROM ( FALSE END AS in_fortune_global_500, - comment + 
ids_joined.comment FROM parat_input.organizations_joined LEFT JOIN @@ -103,12 +103,16 @@ FROM ( parat_input.linkedin_joined USING (CSET_id) + LEFT JOIN + gcp_cset_ror.ror + ON + grid_joined.grid = external_ids.GRID.all GROUP BY CSET_id, name, city, province_state, - country, + organizations_joined.country, website, crunchbase_uuid, crunchbase_url, From 770612b0e07301da939f7c736f80624cf35202c9 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Tue, 10 Oct 2023 14:53:52 -0400 Subject: [PATCH 03/17] Get organization aggregation working; update staging --- company_linkage/parat_data_dag.py | 36 +++++++++---------- .../parat_scripts/aggregate_organizations.py | 20 +++++------ .../test_aggregate_organizations.py | 24 ++++++------- company_linkage/push_to_airflow.sh | 2 ++ .../schemas/aggregated_organizations.json | 4 +-- company_linkage/sequences/initial_data.csv | 10 +++--- .../sequences/visualization_data.csv | 1 + company_linkage/sql/all_publications.sql | 4 +-- .../sql/pubs_in_top_conferences.sql | 2 +- 9 files changed, 51 insertions(+), 52 deletions(-) create mode 100644 company_linkage/sequences/visualization_data.csv diff --git a/company_linkage/parat_data_dag.py b/company_linkage/parat_data_dag.py index e64c2fca..2416ff86 100644 --- a/company_linkage/parat_data_dag.py +++ b/company_linkage/parat_data_dag.py @@ -29,10 +29,9 @@ initial_dataset = "parat_input" intermediate_dataset = "high_resolution_entities" production_dataset = "ai_companies_visualization" -staging_intermediate_dataset = f"staging_{intermediate_dataset}" staging_dataset = f"staging_{production_dataset}" sql_dir = "sql/parat" -schema_dir = "schemas/parat" +schema_dir = "parat/schemas" tmp_dir = f"{production_dataset}/tmp" default_args = get_default_args() @@ -49,7 +48,6 @@ user_defined_macros={ "staging_dataset": staging_dataset, "production_dataset": production_dataset, - "staging_intermediate_dataset": staging_intermediate_dataset, "intermediate_dataset": intermediate_dataset, "initial_dataset": 
initial_dataset }, @@ -102,16 +100,16 @@ curr = start_initial_tables for line in open(seq_path_prefix + initial_query_sequence).readlines(): dataset, table = line.split(",") - staging_table_name = f"staging_{dataset}.{table.strip()}" - next = BigQueryInsertJobOperator( - task_id="create_"+staging_table_name, + table_name = f"{dataset}.{table.strip()}" + next_tab = BigQueryInsertJobOperator( + task_id=f"create_{table_name}", configuration={ "query": { "query": "{% include '" + f"{sql_dir}/{table.strip()}.sql" + "' %}", "useLegacySql": False, "destinationTable": { "projectId": PROJECT_ID, - "datasetId": staging_dataset, + "datasetId": dataset, "tableId": table }, "allowLargeResults": True, @@ -120,8 +118,8 @@ } }, ) - curr >> next - curr = next + curr >> next_tab + curr = next_tab curr >> wait_for_initial_tables # run aggregate_organizations python and load to GCS @@ -140,9 +138,9 @@ load_aggregated_orgs = GCSToBigQueryOperator( task_id=f"load_{aggregated_table}", bucket=DATA_BUCKET, - source_objects=[f"{aggregated_table}.jsonl"], + source_objects=[f"{tmp_dir}/{aggregated_table}.jsonl"], schema_object=f"{schema_dir}/{aggregated_table}.json", - destination_project_dataset_table=f"{staging_intermediate_dataset}.{aggregated_table}", + destination_project_dataset_table=f"{intermediate_dataset}.{aggregated_table}", source_format="NEWLINE_DELIMITED_JSON", create_disposition="CREATE_IF_NEEDED", write_disposition="WRITE_TRUNCATE" @@ -189,9 +187,9 @@ load_ai_papers = GCSToBigQueryOperator( task_id=f"load_ai_company_papers", bucket=DATA_BUCKET, - source_objects=["ai_company_papers.jsonl"], + source_objects=[f"{tmp_dir}/ai_company_papers.jsonl"], schema_object=f"{schema_dir}/ai_papers_schema.json", - destination_project_dataset_table=f"{staging_intermediate_dataset}.ai_company_papers", + destination_project_dataset_table=f"{staging_dataset}.ai_company_papers", source_format="NEWLINE_DELIMITED_JSON", create_disposition="CREATE_IF_NEEDED", write_disposition="WRITE_TRUNCATE" @@ 
-200,9 +198,9 @@ load_ai_patents = GCSToBigQueryOperator( task_id=f"load_ai_company_patents", bucket=DATA_BUCKET, - source_objects=["ai_company_patents.jsonl"], + source_objects=[f"{tmp_dir}/ai_company_patents.jsonl"], schema_object=f"{schema_dir}/ai_patents_schema.json", - destination_project_dataset_table=f"{staging_intermediate_dataset}.ai_company_patents", + destination_project_dataset_table=f"{staging_dataset}.ai_company_patents", source_format="NEWLINE_DELIMITED_JSON", create_disposition="CREATE_IF_NEEDED", write_disposition="WRITE_TRUNCATE" @@ -251,9 +249,9 @@ load_top_papers = GCSToBigQueryOperator( task_id=f"load_top_papers", bucket=DATA_BUCKET, - source_objects=["top_paper_counts.jsonl"], + source_objects=[f"{tmp_dir}/top_paper_counts.jsonl"], schema_object=f"{schema_dir}/top_papers_schema.json", - destination_project_dataset_table=f"{staging_intermediate_dataset}.top_paper_counts", + destination_project_dataset_table=f"{staging_dataset}.top_paper_counts", source_format="NEWLINE_DELIMITED_JSON", create_disposition="CREATE_IF_NEEDED", write_disposition="WRITE_TRUNCATE" @@ -262,9 +260,9 @@ load_all_papers = GCSToBigQueryOperator( task_id=f"load_all_papers", bucket=DATA_BUCKET, - source_objects=["all_paper_counts.jsonl"], + source_objects=[f"{tmp_dir}/all_paper_counts.jsonl"], schema_object=f"{schema_dir}/all_papers_schema.json", - destination_project_dataset_table=f"{staging_intermediate_dataset}.all_paper_counts", + destination_project_dataset_table=f"{staging_dataset}.all_paper_counts", source_format="NEWLINE_DELIMITED_JSON", create_disposition="CREATE_IF_NEEDED", write_disposition="WRITE_TRUNCATE" diff --git a/company_linkage/parat_scripts/aggregate_organizations.py b/company_linkage/parat_scripts/aggregate_organizations.py index 424ea404..43dcbddc 100644 --- a/company_linkage/parat_scripts/aggregate_organizations.py +++ b/company_linkage/parat_scripts/aggregate_organizations.py @@ -23,7 +23,7 @@ def __init__(self, cset_id, name): self.market = [] 
self.crunchbase = {} self.child_crunchbase = [] - self.grid = [] + self.ror = [] self.regex = [] self.bgov_id = [] self.comment = None @@ -130,14 +130,14 @@ def add_child_crunchbase(self, uuid, url): if crunchbase not in self.child_crunchbase and crunchbase != self.crunchbase: self.child_crunchbase.append(crunchbase) - def add_grid(self, grid): + def add_ror(self, ror): """ - Adding GRID (from grid.ac) for aggregation - :param grid: grid value + Adding ROR for aggregation + :param ror: ror value :return: """ - if grid and grid not in self.grid: - self.grid.append(grid) + if ror and ror not in self.ror: + self.ror.append(ror) def add_regex(self, regex): """ @@ -369,8 +369,8 @@ def update_organization_identifiers(self, org, org_id): org_info.add_child_crunchbase(org["crunchbase"]["crunchbase_uuid"], org["crunchbase"]["crunchbase_url"]) else: org_info.add_crunchbase(org["crunchbase"]["crunchbase_uuid"], org["crunchbase"]["crunchbase_url"]) - for grid in org["grid"]: - org_info.add_grid(grid) + for ror in org["ror_id"]: + org_info.add_ror(ror) org_info.add_regex(org["regex"]) org_info.add_linkedin(org["linkedin"]) org_info.add_bgov_id(org["BGOV_id"]) @@ -408,7 +408,7 @@ def print_output(self, output_file, local): "aliases": org_info.aliases, "parent": org_info.parent, "permid": org_info.permid, "market": org_info.market, "crunchbase": org_info.crunchbase, "child_crunchbase": org_info.child_crunchbase, - "grid": org_info.grid, "regex": org_info.regex, + "ror_id": org_info.ror, "regex": org_info.regex, "BGOV_id": org_info.bgov_id, "linkedin": org_info.linkedin, "in_sandp_500": org_info.in_sandp_500, "in_fortune_global_500": org_info.in_fortune_global_500, "comment": org_info.comment, "children": org_info.children, @@ -416,7 +416,7 @@ def print_output(self, output_file, local): out.write(json.dumps(js, ensure_ascii=False) + "\n") out.close() if not local: - subprocess.run(["gsutil", "-m", "cp", "-r", output_file, "gs://parat/"], check=True) + subprocess.run(["gsutil", 
"-m", "cp", "-r", output_file, "gs://airflow-data-exchange/ai_companies_visualization/tmp/"], check=True) def aggregate_organizations(output_file, local=False): diff --git a/company_linkage/parat_scripts/test_aggregate_organizations.py b/company_linkage/parat_scripts/test_aggregate_organizations.py index c0287b21..c9e31397 100644 --- a/company_linkage/parat_scripts/test_aggregate_organizations.py +++ b/company_linkage/parat_scripts/test_aggregate_organizations.py @@ -18,7 +18,7 @@ def test_init(self): self.assertEqual(org.market, []) self.assertEqual(org.crunchbase, {}) self.assertEqual(org.child_crunchbase, []) - self.assertEqual(org.grid, []) + self.assertEqual(org.ror, []) self.assertEqual(org.regex, []) self.assertEqual(org.bgov_id, []) self.assertEqual(org.comment, None) @@ -116,18 +116,18 @@ def test_add_child_crunchbase(self): "https://www.crunchbase.com/organization/algorithmia") self.assertEqual(len(org.child_crunchbase), 2) - def test_add_grid(self): + def test_add_ror(self): org = aggregate_organizations.Organization(1, "test") - org.add_grid("grid.419660.c") - self.assertEqual(org.grid[0], "grid.419660.c") - self.assertEqual(len(org.grid), 1) + org.add_ror("https://ror.org/05a8p8995") + self.assertEqual(org.ror[0], "https://ror.org/05a8p8995") + self.assertEqual(len(org.ror), 1) # Don't add a duplicate entry! 
- org.add_grid("grid.419660.c") - self.assertEqual(len(org.grid), 1) + org.add_ror("https://ror.org/05a8p8995") + self.assertEqual(len(org.ror), 1) # Do add a new one - org.add_grid("grid.481863.0") - self.assertEqual(org.grid[1], "grid.481863.0") - self.assertEqual(len(org.grid), 2) + org.add_ror("https://ror.org/00kdbj440") + self.assertEqual(org.ror[1], "https://ror.org/00kdbj440") + self.assertEqual(len(org.ror), 2) def test_add_regex(self): org = aggregate_organizations.Organization(1, "test") @@ -157,8 +157,8 @@ def test_add_bgov_id(self): def test_add_comment(self): org = aggregate_organizations.Organization(1, "test") - org.add_comment("grid id not available") - self.assertEqual(org.comment, "grid id not available") + org.add_comment("crunchbase id not available") + self.assertEqual(org.comment, "crunchbase id not available") other_org = aggregate_organizations.Organization(2, "test_2") other_org.add_comment("") self.assertEqual(other_org.comment, None) diff --git a/company_linkage/push_to_airflow.sh b/company_linkage/push_to_airflow.sh index 7d3813b7..675430e0 100755 --- a/company_linkage/push_to_airflow.sh +++ b/company_linkage/push_to_airflow.sh @@ -15,4 +15,6 @@ gsutil cp sequences/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sequences/p gsutil rm gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/parat/* gsutil cp sql/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/parat/ gsutil cp schemas/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/schemas/parat/ +gsutil rm -r gs://airflow-data-exchange/parat/schemas/* +gsutil cp schemas/* gs://airflow-data-exchange/parat/schemas/ gsutil -m cp -r parat_scripts/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/parat_scripts/ \ No newline at end of file diff --git a/company_linkage/schemas/aggregated_organizations.json b/company_linkage/schemas/aggregated_organizations.json index 40cf0ef4..22aa3e28 100644 --- a/company_linkage/schemas/aggregated_organizations.json +++ 
b/company_linkage/schemas/aggregated_organizations.json @@ -157,9 +157,9 @@ }, { "mode": "REPEATED", - "name": "grid", + "name": "ror_id", "type": "STRING", - "description": "The company's GRID identifier." + "description": "The company's ROR identifier." }, { "mode": "REPEATED", diff --git a/company_linkage/sequences/initial_data.csv b/company_linkage/sequences/initial_data.csv index a6e65b82..c1a10432 100644 --- a/company_linkage/sequences/initial_data.csv +++ b/company_linkage/sequences/initial_data.csv @@ -1,6 +1,6 @@ high_resolution_entities,organizations -ai_companies_visualization,ai_publications -ai_companies_visualization,linked_ai_patents -ai_companies_visualization,top_conference_pubs -ai_companies_visualization,pubs_in_top_conferences -ai_companies_visualization,all_publications \ No newline at end of file +staging_ai_companies_visualization,ai_publications +staging_ai_companies_visualization,linked_ai_patents +staging_ai_companies_visualization,top_conference_pubs +staging_ai_companies_visualization,pubs_in_top_conferences +staging_ai_companies_visualization,all_publications \ No newline at end of file diff --git a/company_linkage/sequences/visualization_data.csv b/company_linkage/sequences/visualization_data.csv new file mode 100644 index 00000000..49ce1dd8 --- /dev/null +++ b/company_linkage/sequences/visualization_data.csv @@ -0,0 +1 @@ +staging_ai_companies_visualization,initial_visualization_data diff --git a/company_linkage/sql/all_publications.sql b/company_linkage/sql/all_publications.sql index 6631575f..e11ef86f 100644 --- a/company_linkage/sql/all_publications.sql +++ b/company_linkage/sql/all_publications.sql @@ -1,6 +1,4 @@ - -- Pulling every publication id linked to every author affiliate and all years because we'll want those later for yearly counts -CREATE OR REPLACE TABLE - ai_companies_visualization.all_publications AS + -- Pulling every publication id linked to every author affiliate and all years because we'll want those later for 
yearly count WITH ror AS ( -- Adding in org names and country data using ROR diff --git a/company_linkage/sql/pubs_in_top_conferences.sql b/company_linkage/sql/pubs_in_top_conferences.sql index cc1bee22..3b23ee19 100644 --- a/company_linkage/sql/pubs_in_top_conferences.sql +++ b/company_linkage/sql/pubs_in_top_conferences.sql @@ -27,7 +27,7 @@ SELECT ror_id, year FROM - ai_companies_visualization.top_conference_pubs AS top_pubs + staging_ai_companies_visualization.top_conference_pubs AS top_pubs -- We're inner joining because if there's no affiliate information at all we have no way to even evaluate this data for our purposes INNER JOIN affils From cf25b26ddae88a152b1f9d344f9a9ab7b089bd38 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Tue, 10 Oct 2023 16:02:16 -0400 Subject: [PATCH 04/17] Refactor names for idempotence; create sequences file --- company_linkage/README.md | 38 +++++++++---------- company_linkage/parat_data_dag.py | 4 +- .../sequences/visualization_data.csv | 18 +++++++++ .../sql/adding_paper_patent_data.sql | 34 ----------------- ...l => initial_paper_visualization_data.sql} | 2 - ... => initial_patent_visualization_data.sql} | 2 - ...ons.sql => initial_visualization_data.sql} | 0 ... initial_workforce_visualization_data.sql} | 0 company_linkage/sql/merged_ai_papers.sql | 13 ------- company_linkage/sql/omitting_companies.sql | 15 -------- ...apers.sql => paper_visualization_data.sql} | 0 ...aper_visualization_data_with_clusters.sql} | 0 ...lization_data_with_company_references.sql} | 0 ... 
=> paper_visualization_data_with_mag.sql} | 0 ...paper_visualization_data_with_methods.sql} | 0 ...> paper_visualization_data_with_tasks.sql} | 0 ...ents.sql => patent_visualization_data.sql} | 0 ...atent_visualization_data_with_by_year.sql} | 0 ...ny_metadata.sql => visualization_data.sql} | 0 ...ql => visualization_data_omit_by_rule.sql} | 0 ...=> visualization_data_with_all_papers.sql} | 0 ...ql => visualization_data_with_by_year.sql} | 0 ...=> visualization_data_with_top_papers.sql} | 0 ...e.sql => workforce_visualization_data.sql} | 0 ...force_visualization_data_with_ai_jobs.sql} | 0 25 files changed, 39 insertions(+), 87 deletions(-) delete mode 100644 company_linkage/sql/adding_paper_patent_data.sql rename company_linkage/sql/{creating_paper_visualization_data.sql => initial_paper_visualization_data.sql} (90%) rename company_linkage/sql/{creating_patent_visualization_data.sql => initial_patent_visualization_data.sql} (98%) rename company_linkage/sql/{creating_initial_visualization_data_publications.sql => initial_visualization_data.sql} (100%) rename company_linkage/sql/{creating_workforce_visualization_data.sql => initial_workforce_visualization_data.sql} (100%) delete mode 100644 company_linkage/sql/merged_ai_papers.sql delete mode 100644 company_linkage/sql/omitting_companies.sql rename company_linkage/sql/{omit_by_rule_papers.sql => paper_visualization_data.sql} (100%) rename company_linkage/sql/{adding_top_science_map_clusters.sql => paper_visualization_data_with_clusters.sql} (100%) rename company_linkage/sql/{adding_company_references.sql => paper_visualization_data_with_company_references.sql} (100%) rename company_linkage/sql/{adding_top_mag_ai_fields.sql => paper_visualization_data_with_mag.sql} (100%) rename company_linkage/sql/{adding_top_methods.sql => paper_visualization_data_with_methods.sql} (100%) rename company_linkage/sql/{adding_top_tasks.sql => paper_visualization_data_with_tasks.sql} (100%) rename 
company_linkage/sql/{omit_by_rule_patents.sql => patent_visualization_data.sql} (100%) rename company_linkage/sql/{adding_ai_patents_by_year_to_visualization.sql => patent_visualization_data_with_by_year.sql} (100%) rename company_linkage/sql/{adding_crunchbase_company_metadata.sql => visualization_data.sql} (100%) rename company_linkage/sql/{omit_by_rule.sql => visualization_data_omit_by_rule.sql} (100%) rename company_linkage/sql/{adding_all_paper_counts.sql => visualization_data_with_all_papers.sql} (100%) rename company_linkage/sql/{adding_ai_pubs_by_year_to_visualization.sql => visualization_data_with_by_year.sql} (100%) rename company_linkage/sql/{adding_top_paper_counts.sql => visualization_data_with_top_papers.sql} (100%) rename company_linkage/sql/{omit_by_rule_workforce.sql => workforce_visualization_data.sql} (100%) rename company_linkage/sql/{adding_ai_jobs_to_workforce_visualization.sql => workforce_visualization_data_with_ai_jobs.sql} (100%) diff --git a/company_linkage/README.md b/company_linkage/README.md index 47438633..e3248a3b 100644 --- a/company_linkage/README.md +++ b/company_linkage/README.md @@ -31,25 +31,25 @@ run some of this code as-is. 13. Upload `top_paper_counts.jsonl` to `ai_companies_visualization.top_paper_counts` using the [top_papers_schema](schemas/top_papers_schema.json) 14. `python3 all_papers.py all_paper_counts.jsonl` 15. Upload `all_paper_counts.jsonl` to `ai_companies_visualization.total_paper_counts` using the [all_papers_schema](schemas/all_papers_schema.json) -16. [creating_initial_visualization_data_publications.sql](sql/creating_initial_visualization_data_publications.sql) -17. [adding_ai_pubs_by_year_to_visualization.sql](sql/adding_ai_pubs_by_year_to_visualization.sql) -18. [creating_patent_visualization_data.sql](sql/creating_patent_visualization_data.sql) -19. [adding_ai_patents_by_year_to_visualization.sql](sql/adding_ai_patents_by_year_to_visualization.sql) -20. 
[creating_paper_visualization_data.sql](sql/creating_paper_visualization_data.sql) -21. [adding_top_mag_ai_fields.sql](sql/adding_top_mag_ai_fields.sql) -22. [adding_top_science_map_clusters.sql](sql/adding_top_science_map_clusters.sql) -23. [adding_company_references.sql](sql/adding_company_references.sql) -24. [adding_top_tasks.sql](sql/adding_top_tasks.sql) -25. [adding_top_methods.sql](sql/adding_top_methods.sql) -26. [adding_top_paper_counts.sql](sql/adding_top_paper_counts.sql) -27. [adding_all_paper_counts.sql](sql/adding_all_paper_counts.sql) -28. [creating_workforce_visualization_data.sql](sql/creating_workforce_visualization_data.sql) -29. [adding_ai_jobs_to_workforce_visualization.sql](sql/adding_ai_jobs_to_workforce_visualization.sql) -30. [omit_by_rule.sql](sql/omit_by_rule.sql) -31. [omit_by_rule_papers.sql](sql/omit_by_rule_papers.sql) -32. [omit_by_rule_patents.sql](sql/omit_by_rule_patents.sql) -33. [omit_by_rule_workforce.sql](sql/omit_by_rule_workforce.sql) -34. [adding_crunchbase_company_metadata.sql](sql/adding_crunchbase_company_metadata.sql) +16. [initial_visualization_data.sql](sql/initial_visualization_data.sql) +17. [visualization_data_with_by_year.sql](sql/visualization_data_with_by_year.sql) +18. [initial_patent_visualization_data.sql](sql/initial_patent_visualization_data.sql) +19. [patent_visualization_data_with_by_year.sql](sql/patent_visualization_data_with_by_year.sql) +20. [initial_paper_visualization_data.sql](sql/initial_paper_visualization_data.sql) +21. [paper_visualization_data_with_mag.sql](sql/paper_visualization_data_with_mag.sql) +22. [paper_visualization_data_with_clusters.sql](sql/paper_visualization_data_with_clusters.sql) +23. [paper_visualization_data_with_company_references.sql](sql/paper_visualization_data_with_company_references.sql) +24. [paper_visualization_data_with_tasks.sql](sql/paper_visualization_data_with_tasks.sql) +25. 
[paper_visualization_data_with_methods.sql](sql/paper_visualization_data_with_methods.sql) +26. [visualization_data_with_top_papers.sql](sql/visualization_data_with_top_papers.sql) +27. [visualization_data_with_all_papers.sql](sql/visualization_data_with_all_papers.sql) +28. [initial_workforce_visualization_data.sql](sql/initial_workforce_visualization_data.sql) +29. [workforce_visualization_data_with_ai_jobs.sql](sql/workforce_visualization_data_with_ai_jobs.sql) +30. [visualization_data_omit_by_rule.sql](sql/visualization_data_omit_by_rule.sql) +31. [paper_visualization_data.sql](sql/paper_visualization_data.sql) +32. [patent_visualization_data.sql](sql/patent_visualization_data.sql) +33. [workforce_visualization_data.sql](sql/workforce_visualization_data.sql) +34. [visualization_data.sql](sql/visualization_data.sql) # Deployment diff --git a/company_linkage/parat_data_dag.py b/company_linkage/parat_data_dag.py index 2416ff86..029a0ec3 100644 --- a/company_linkage/parat_data_dag.py +++ b/company_linkage/parat_data_dag.py @@ -160,7 +160,7 @@ cmds=["/bin/bash"], arguments=["-c", (f"echo 'getting AI counts!' ; rm -r ai || true ; " f"mkdir -p ai && " - f"python3 get_ai_counts.py ai/ai_company_papers.jsonl ai/ai_company_patents.jsonl" + f"python3 get_ai_counts.py ai/ai_company_papers.jsonl ai/ai_company_patents.jsonl && " f"gsutil -m cp -r ai gs://{DATA_BUCKET}/{tmp_dir}/ ")], namespace="default", image=f"us.gcr.io/{PROJECT_ID}/parat", @@ -218,7 +218,7 @@ cmds=["/bin/bash"], arguments=["-c", (f"echo 'getting {paper_type} paper counts!' 
; rm -r {paper_type} || true ; " f"mkdir -p {paper_type} && " - f"python3 {paper_type}_papers.py {paper_type}/{paper_type}_paper_counts.jsonl" + f"python3 {paper_type}_papers.py {paper_type}/{paper_type}_paper_counts.jsonl && " f"gsutil -m cp -r {paper_type} gs://{DATA_BUCKET}/{tmp_dir}/ ")], namespace="default", image=f"us.gcr.io/{PROJECT_ID}/parat", diff --git a/company_linkage/sequences/visualization_data.csv b/company_linkage/sequences/visualization_data.csv index 49ce1dd8..f6a24560 100644 --- a/company_linkage/sequences/visualization_data.csv +++ b/company_linkage/sequences/visualization_data.csv @@ -1 +1,19 @@ staging_ai_companies_visualization,initial_visualization_data +staging_ai_companies_visualization,visualization_data_with_by_year +staging_ai_companies_visualization,visualization_data_with_top_papers +staging_ai_companies_visualization,visualization_data_with_all_papers +staging_ai_companies_visualization,visualization_data_omit_by_rule +staging_ai_companies_visualization,visualization_data +staging_ai_companies_visualization,initial_patent_visualization_data +staging_ai_companies_visualization,patent_visualization_data_with_by_year +staging_ai_companies_visualization,patent_visualization_data +staging_ai_companies_visualization,initial_paper_visualization_data +staging_ai_companies_visualization,paper_visualization_data_with_mag +staging_ai_companies_visualization,paper_visualization_data_with_clusters +staging_ai_companies_visualization,paper_visualization_data_with_company_references +staging_ai_companies_visualization,paper_visualization_data_with_tasks +staging_ai_companies_visualization,paper_visualization_data_with_methods +staging_ai_companies_visualization,paper_visualization_data +staging_ai_companies_visualization,initial_workforce_visualization_data +staging_ai_companies_visualization,workforce_visualization_data_with_ai_jobs +staging_ai_companies_visualization,workforce_visualization_data \ No newline at end of file diff --git 
a/company_linkage/sql/adding_paper_patent_data.sql b/company_linkage/sql/adding_paper_patent_data.sql deleted file mode 100644 index b075c7c7..00000000 --- a/company_linkage/sql/adding_paper_patent_data.sql +++ /dev/null @@ -1,34 +0,0 @@ --- DEPRECATED, REMOVE SOON --- Update the visualization table itself to add paper and patent data -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS - -- Pull in the paper and patent counts, along with the CSET ids to link them in -WITH - count_data AS ( - SELECT - CSET_id, - ai_pubs, - ai_pubs_by_year, - ai_patents, - ai_patents_by_year - FROM - `gcp-cset-projects.ai_companies_visualization.paper_patent_counts`), - -- Pull in the current visualization data. Exclude the ai_pubs data, since that was included when we built the paper/patent data, so we don't need it - viz_data AS ( - SELECT - * EXCEPT(ai_pubs, ai_pubs_by_year) - FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data`) - -- Join the two together using the CSET id -SELECT - viz_data.*, - ai_pubs, - ai_pubs_by_year, - ai_patents, - ai_patents_by_year -FROM - viz_data -LEFT JOIN - count_data -ON - viz_data.CSET_id = count_data.CSET_id \ No newline at end of file diff --git a/company_linkage/sql/creating_paper_visualization_data.sql b/company_linkage/sql/initial_paper_visualization_data.sql similarity index 90% rename from company_linkage/sql/creating_paper_visualization_data.sql rename to company_linkage/sql/initial_paper_visualization_data.sql index 5aacbb79..ba20b588 100644 --- a/company_linkage/sql/creating_paper_visualization_data.sql +++ b/company_linkage/sql/initial_paper_visualization_data.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.paper_visualization_data AS WITH get_citations AS ( SELECT diff --git a/company_linkage/sql/creating_patent_visualization_data.sql b/company_linkage/sql/initial_patent_visualization_data.sql similarity index 98% rename from 
company_linkage/sql/creating_patent_visualization_data.sql rename to company_linkage/sql/initial_patent_visualization_data.sql index 134c75b8..7720057f 100644 --- a/company_linkage/sql/creating_patent_visualization_data.sql +++ b/company_linkage/sql/initial_patent_visualization_data.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.patent_visualization_data AS WITH aipats AS ( -- Pulling all the patents from any of our companies diff --git a/company_linkage/sql/creating_initial_visualization_data_publications.sql b/company_linkage/sql/initial_visualization_data.sql similarity index 100% rename from company_linkage/sql/creating_initial_visualization_data_publications.sql rename to company_linkage/sql/initial_visualization_data.sql diff --git a/company_linkage/sql/creating_workforce_visualization_data.sql b/company_linkage/sql/initial_workforce_visualization_data.sql similarity index 100% rename from company_linkage/sql/creating_workforce_visualization_data.sql rename to company_linkage/sql/initial_workforce_visualization_data.sql diff --git a/company_linkage/sql/merged_ai_papers.sql b/company_linkage/sql/merged_ai_papers.sql deleted file mode 100644 index 9f3478f2..00000000 --- a/company_linkage/sql/merged_ai_papers.sql +++ /dev/null @@ -1,13 +0,0 @@ -CREATE OR REPLACE TABLE - `gcp-cset-projects.ai_companies_visualization.ai_company_pubs` AS -SELECT - DISTINCT * -FROM - `gcp-cset-projects.ai_companies_visualization.ai_company_pubs` -UNION DISTINCT -SELECT - DISTINCT * -FROM - `gcp-cset-projects.ai_companies_visualization.ai_company_pubs_no_grid` -ORDER BY - id \ No newline at end of file diff --git a/company_linkage/sql/omitting_companies.sql b/company_linkage/sql/omitting_companies.sql deleted file mode 100644 index 1d5d3cef..00000000 --- a/company_linkage/sql/omitting_companies.sql +++ /dev/null @@ -1,15 +0,0 @@ --- DEPRECATED, REMOVE WHEN READY --- We want to omit companies from the visualization -CREATE OR REPLACE TABLE - 
ai_companies_visualization.visualization_data AS -SELECT - * -FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data` - -- Omitting companies based on list -WHERE - CSET_id NOT IN ( - SELECT - * - FROM - ai_companies_visualization.omit) \ No newline at end of file diff --git a/company_linkage/sql/omit_by_rule_papers.sql b/company_linkage/sql/paper_visualization_data.sql similarity index 100% rename from company_linkage/sql/omit_by_rule_papers.sql rename to company_linkage/sql/paper_visualization_data.sql diff --git a/company_linkage/sql/adding_top_science_map_clusters.sql b/company_linkage/sql/paper_visualization_data_with_clusters.sql similarity index 100% rename from company_linkage/sql/adding_top_science_map_clusters.sql rename to company_linkage/sql/paper_visualization_data_with_clusters.sql diff --git a/company_linkage/sql/adding_company_references.sql b/company_linkage/sql/paper_visualization_data_with_company_references.sql similarity index 100% rename from company_linkage/sql/adding_company_references.sql rename to company_linkage/sql/paper_visualization_data_with_company_references.sql diff --git a/company_linkage/sql/adding_top_mag_ai_fields.sql b/company_linkage/sql/paper_visualization_data_with_mag.sql similarity index 100% rename from company_linkage/sql/adding_top_mag_ai_fields.sql rename to company_linkage/sql/paper_visualization_data_with_mag.sql diff --git a/company_linkage/sql/adding_top_methods.sql b/company_linkage/sql/paper_visualization_data_with_methods.sql similarity index 100% rename from company_linkage/sql/adding_top_methods.sql rename to company_linkage/sql/paper_visualization_data_with_methods.sql diff --git a/company_linkage/sql/adding_top_tasks.sql b/company_linkage/sql/paper_visualization_data_with_tasks.sql similarity index 100% rename from company_linkage/sql/adding_top_tasks.sql rename to company_linkage/sql/paper_visualization_data_with_tasks.sql diff --git a/company_linkage/sql/omit_by_rule_patents.sql 
b/company_linkage/sql/patent_visualization_data.sql similarity index 100% rename from company_linkage/sql/omit_by_rule_patents.sql rename to company_linkage/sql/patent_visualization_data.sql diff --git a/company_linkage/sql/adding_ai_patents_by_year_to_visualization.sql b/company_linkage/sql/patent_visualization_data_with_by_year.sql similarity index 100% rename from company_linkage/sql/adding_ai_patents_by_year_to_visualization.sql rename to company_linkage/sql/patent_visualization_data_with_by_year.sql diff --git a/company_linkage/sql/adding_crunchbase_company_metadata.sql b/company_linkage/sql/visualization_data.sql similarity index 100% rename from company_linkage/sql/adding_crunchbase_company_metadata.sql rename to company_linkage/sql/visualization_data.sql diff --git a/company_linkage/sql/omit_by_rule.sql b/company_linkage/sql/visualization_data_omit_by_rule.sql similarity index 100% rename from company_linkage/sql/omit_by_rule.sql rename to company_linkage/sql/visualization_data_omit_by_rule.sql diff --git a/company_linkage/sql/adding_all_paper_counts.sql b/company_linkage/sql/visualization_data_with_all_papers.sql similarity index 100% rename from company_linkage/sql/adding_all_paper_counts.sql rename to company_linkage/sql/visualization_data_with_all_papers.sql diff --git a/company_linkage/sql/adding_ai_pubs_by_year_to_visualization.sql b/company_linkage/sql/visualization_data_with_by_year.sql similarity index 100% rename from company_linkage/sql/adding_ai_pubs_by_year_to_visualization.sql rename to company_linkage/sql/visualization_data_with_by_year.sql diff --git a/company_linkage/sql/adding_top_paper_counts.sql b/company_linkage/sql/visualization_data_with_top_papers.sql similarity index 100% rename from company_linkage/sql/adding_top_paper_counts.sql rename to company_linkage/sql/visualization_data_with_top_papers.sql diff --git a/company_linkage/sql/omit_by_rule_workforce.sql b/company_linkage/sql/workforce_visualization_data.sql similarity index 100% 
rename from company_linkage/sql/omit_by_rule_workforce.sql rename to company_linkage/sql/workforce_visualization_data.sql diff --git a/company_linkage/sql/adding_ai_jobs_to_workforce_visualization.sql b/company_linkage/sql/workforce_visualization_data_with_ai_jobs.sql similarity index 100% rename from company_linkage/sql/adding_ai_jobs_to_workforce_visualization.sql rename to company_linkage/sql/workforce_visualization_data_with_ai_jobs.sql From 4694433532e94d71beddee7ec75e25917e10c877 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Tue, 10 Oct 2023 16:55:50 -0400 Subject: [PATCH 05/17] Switch AI count scripts to ROR; update workforce data --- company_linkage/parat_scripts/all_papers.py | 2 +- .../parat_scripts/get_ai_counts.py | 50 ++++++++++--------- .../parat_scripts/test_ai_counts.py | 12 ++--- company_linkage/parat_scripts/top_papers.py | 2 +- .../initial_workforce_visualization_data.sql | 16 +++--- 5 files changed, 42 insertions(+), 40 deletions(-) diff --git a/company_linkage/parat_scripts/all_papers.py b/company_linkage/parat_scripts/all_papers.py index 9fe5d534..f9234069 100644 --- a/company_linkage/parat_scripts/all_papers.py +++ b/company_linkage/parat_scripts/all_papers.py @@ -18,7 +18,7 @@ def main() -> None: paper_finder.get_identifiers() # These are the only two lines that make this different from running AI pubs # We select from a different table - table_name = "ai_companies_visualization.all_publications" + table_name = "staging_ai_companies_visualization.all_publications" # And we write out our data to a different variable companies = paper_finder.run_query_papers(table_name, "all_pubs", by_year=True) paper_finder.write_output(companies, args.output_file) diff --git a/company_linkage/parat_scripts/get_ai_counts.py b/company_linkage/parat_scripts/get_ai_counts.py index 504099de..681f704d 100644 --- a/company_linkage/parat_scripts/get_ai_counts.py +++ b/company_linkage/parat_scripts/get_ai_counts.py @@ -14,7 +14,7 @@ def __init__(self) -> None: AI 
papers in top conferences, etc.) and AI patents (from Dimensions and 1790 jointly). """ self.regex_dict = defaultdict(list) - self.grid_dict = defaultdict(list) + self.ror_dict = defaultdict(list) self.cset_ids = [] self.company_ids = [] self.patent_fields = ["Physical_Sciences_and_Engineering", @@ -58,7 +58,7 @@ def get_identifiers(self) -> None: Pulling the regular expressions used to find papers and patents through means other than GRID. :return: """ - regex_query = """SELECT CSET_id, regex, grid FROM + regex_query = """SELECT CSET_id, regex, ror_id FROM `gcp-cset-projects.high_resolution_entities.aggregated_organizations`""" client = bigquery.Client() query_job = client.query(regex_query) @@ -67,15 +67,15 @@ def get_identifiers(self) -> None: if result.regex: for regex in result.regex: self.regex_dict[result.CSET_id].append(regex) - if result.grid: - for grid_id in result.grid: - self.grid_dict[result.CSET_id].append(grid_id) + if result.ror_id: + for ror in result.ror_id: + self.ror_dict[result.CSET_id].append(ror) self.cset_ids.append(result.CSET_id) def run_query_papers(self, table_name: str, field_name: str, test: bool = False, by_year: bool = False) -> list: """ - Running a query to find paper counts using regex for papers missing GRID. This query combines - this data with preexisting paper counts already identified using SQL for papers that have GRID. + Running a query to find paper counts using regex for papers missing ROR. This query combines + this data with preexisting paper counts already identified using SQL for papers that have ROR. We no longer use this query for AI papers, but it is still used for top conference papers and total papers. 
:param table_name: The table to look for papers in @@ -100,8 +100,8 @@ def run_query_papers(self, table_name: str, field_name: str, test: bool = False, if len(regexes) > 1: for regex in regexes[1:]: query += f"""OR regexp_contains(org_name, r'(?i){regex}') """ - if cset_id in self.grid_dict: - query += f"""OR grid_id IN ({str(self.grid_dict[cset_id])[1:-1]})""" + if cset_id in self.ror_dict: + query += f"""OR ror_id IN ({str(self.ror_dict[cset_id])[1:-1]})""" query_job = client.query(query) # query_job is an iterator, so even though we're only returning one row we're going to loop for element in query_job: @@ -109,7 +109,7 @@ def run_query_papers(self, table_name: str, field_name: str, test: bool = False, # if we don't have total data, we won't have by_year either if by_year: row_dict[field_name_by_year] = self.run_query_papers_by_year(table_name, field_name, regexes, - self.grid_dict[cset_id]) + self.ror_dict[cset_id]) if not row_dict[field_name]: # if we end up without any papers, set that to be true row_dict[field_name] = 0 @@ -119,7 +119,7 @@ def run_query_papers(self, table_name: str, field_name: str, test: bool = False, companies.append(row_dict) return companies - def run_query_papers_by_year(self, table_name: str, field_name: str, regexes: list, grids: list) -> list: + def run_query_papers_by_year(self, table_name: str, field_name: str, regexes: list, rors: list) -> list: """ Getting the same paper count data, except split by year. 
We no longer use this query for AI papers, but it is still used for top conference papers and @@ -127,7 +127,7 @@ def run_query_papers_by_year(self, table_name: str, field_name: str, regexes: li :param table_name: The table to look for papers in :param field_name: The json field name :param regexes: The regexes for whichever CSET_id we're searching for - :param grids: The grids for whichever CSET_id we're searching for if they exist; otherwise an empty list + :param rors: The rors for whichever CSET_id we're searching for if they exist; otherwise an empty list :return: """ field_name_by_year = f"{field_name}_by_year" @@ -143,8 +143,8 @@ def run_query_papers_by_year(self, table_name: str, field_name: str, regexes: li for regex in regexes[1:]: # regex_to_use = rf"r'(?i){regex}'" query += f"""OR regexp_contains(org_name, r'(?i){regex}') """ - if grids: - query += f"""OR grid_id IN ({str(grids)[1:-1]}) """ + if rors: + query += f"""OR ror_id IN ({str(rors)[1:-1]}) """ query += """GROUP BY year ORDER BY year""" client = bigquery.Client() query_job = client.query(query) @@ -160,7 +160,7 @@ def run_query_id_papers(self, table_name: str, test: bool = False) -> list: :param test: False if not running as a unit test :return: """ - companies_query = f"""SELECT CSET_id, grid FROM + companies_query = f"""SELECT CSET_id, ror_id FROM `gcp-cset-projects.high_resolution_entities.aggregated_organizations`""" if test: companies_query += """ LIMIT 25""" @@ -180,9 +180,9 @@ def run_query_id_papers(self, table_name: str, test: bool = False) -> list: if len(regexes) > 1: for regex in regexes[1:]: query += f"""OR regexp_contains(org_name, r'(?i){regex}') """ - if row["grid"]: - self.grid_dict[row["CSET_id"]] = row["grid"] - query += f"""OR grid_id IN ({str(row["grid"])[1:-1]})""" + if row["ror_id"]: + self.ror_dict[row["CSET_id"]] = row["ror_id"] + query += f"""OR ror_id IN ({str(row["ror_id"])[1:-1]})""" query_job = client.query(query) # get all the merged ids for element in query_job: 
@@ -192,11 +192,15 @@ def run_query_id_papers(self, table_name: str, test: bool = False) -> list: return company_rows def run_query_id_patents(self): + """ + Get patent counts one by one using CSET_ids. + :return: + """ patent_companies = [] for cset_id in self.company_ids: if cset_id in self.regex_dict: regexes = self.regex_dict[cset_id] - grids = self.grid_dict[cset_id] + rors = self.ror_dict[cset_id] query = f"""SELECT DISTINCT family_id, priority_year, @@ -236,14 +240,14 @@ def run_query_id_patents(self): Machine_Learning, Search_Methods FROM - ai_companies_visualization.linked_ai_patents + staging_ai_companies_visualization.linked_ai_patents WHERE regexp_contains(assignee, r'(?i){regexes[0]}') """ # if we have more than one regex for an org, include all of them if len(regexes) > 1: for regex in regexes[1:]: query += f"""OR regexp_contains(assignee, r'(?i){regex}') """ - if grids: - query += f"""OR grid IN ({str(grids)[1:-1]})""" + if rors: + query += f"""OR ror_id IN ({str(rors)[1:-1]})""" client = bigquery.Client() query_job = client.query(query) for row in query_job: @@ -278,7 +282,7 @@ def main() -> None: count_getter = CountGetter() print("Fetching identifiers") count_getter.get_identifiers() - table_name = "gcp-cset-projects.ai_companies_visualization.ai_publications" + table_name = "gcp-cset-projects.staging_ai_companies_visualization.ai_publications" print("Fetching paper data") company_rows = count_getter.run_query_id_papers(table_name) print("Writing results") diff --git a/company_linkage/parat_scripts/test_ai_counts.py b/company_linkage/parat_scripts/test_ai_counts.py index 3845ffff..926a2731 100644 --- a/company_linkage/parat_scripts/test_ai_counts.py +++ b/company_linkage/parat_scripts/test_ai_counts.py @@ -22,7 +22,7 @@ def test_get_identifiers(self): count_getter.get_identifiers() # the dicts are populated self.assertGreater(len(count_getter.regex_dict), 0) - self.assertGreater(len(count_getter.grid_dict), 0) + 
self.assertGreater(len(count_getter.ror_dict), 0) self.assertGreater(len(count_getter.cset_ids), 0) self.assertEqual(type(count_getter.cset_ids), list) # the values in the dict are the correct type @@ -30,16 +30,16 @@ def test_get_identifiers(self): self.assertEqual(type(key_val), int) # we allow multiple regexes, so we have a list self.assertEqual(type(count_getter.regex_dict[key_val]), list) - for key_val in count_getter.grid_dict.keys(): + for key_val in count_getter.ror_dict.keys(): self.assertEqual(type(key_val), int) # we allow multiple regexes, so we have a list - self.assertEqual(type(count_getter.grid_dict[key_val]), list) + self.assertEqual(type(count_getter.ror_dict[key_val]), list) @ignore_warnings def test_run_query_papers(self): count_getter = CountGetter() count_getter.get_identifiers() - table_name = "gcp-cset-projects.ai_companies_visualization.ai_publications" + table_name = "gcp-cset-projects.staging_ai_companies_visualization.ai_publications" test = True companies = count_getter.run_query_papers(table_name, "ai_pubs", test=test, by_year=False) # Make sure we're setting the AI pubs for every company! 
@@ -67,7 +67,7 @@ def test_run_query_papers(self): def test_run_query_id_papers(self): count_getter = CountGetter() count_getter.get_identifiers() - table_name = "gcp-cset-projects.ai_companies_visualization.ai_publications" + table_name = "gcp-cset-projects.staging_ai_companies_visualization.ai_publications" test = True company_rows = count_getter.run_query_id_papers(table_name, test=test) for company_row in company_rows: @@ -84,7 +84,7 @@ def test_run_query_id_papers(self): def test_run_query_id_patents(self): count_getter = CountGetter() count_getter.get_identifiers() - table_name = "gcp-cset-projects.ai_companies_visualization.ai_publications" + table_name = "gcp-cset-projects.staging_ai_companies_visualization.ai_publications" test = True count_getter.run_query_id_papers(table_name, test) patent_companies = count_getter.run_query_id_patents() diff --git a/company_linkage/parat_scripts/top_papers.py b/company_linkage/parat_scripts/top_papers.py index a11da6af..bc1bbece 100644 --- a/company_linkage/parat_scripts/top_papers.py +++ b/company_linkage/parat_scripts/top_papers.py @@ -18,7 +18,7 @@ def main() -> None: paper_finder.get_identifiers() # These are the only two lines that make this different from running AI pubs # We select from a different table - table_name = "ai_companies_visualization.pubs_in_top_conferences" + table_name = "staging_ai_companies_visualization.pubs_in_top_conferences" # And we write out our data to a different variable companies = paper_finder.run_query_papers(table_name, "ai_pubs_in_top_conferences", by_year=True) paper_finder.write_output(companies, args.output_file) diff --git a/company_linkage/sql/initial_workforce_visualization_data.sql b/company_linkage/sql/initial_workforce_visualization_data.sql index 70620d57..1c0f9be9 100644 --- a/company_linkage/sql/initial_workforce_visualization_data.sql +++ b/company_linkage/sql/initial_workforce_visualization_data.sql @@ -1,11 +1,9 @@ -CREATE OR REPLACE TABLE - 
ai_companies_visualization.workforce_visualization_data AS WITH clean_linkedins AS ( SELECT DISTINCT cset_id, name, - REPLACE(linkedins, "https://www.", "http://") AS linkedin + REPLACE(REPLACE(linkedins, "https://www.", ""), "http://www.", "") AS linkedin FROM high_resolution_entities.aggregated_organizations CROSS JOIN @@ -16,11 +14,11 @@ SELECT FROM clean_linkedins LEFT JOIN - `gcp-cset-projects.gcp_cset_revelio.position` position + revelio.individual_position ON - linkedin = company_li_url + linkedin = company_linkedin_url INNER JOIN - gcp_cset_revelio.role_lookup + revelio.role_lookup USING (mapped_role) INNER JOIN @@ -28,12 +26,12 @@ INNER JOIN ON (k1000 = role_k1000) LEFT JOIN - gcp_cset_revelio.education + revelio.individual_education USING (user_id) WHERE - (position.enddate IS NULL - OR position.enddate > CURRENT_DATE ()) + (individual_position.enddate IS NULL + OR individual_position.enddate > CURRENT_DATE ()) AND (ba_req IS FALSE OR ((degree = "Bachelor" OR degree = "Master" From 1f88998c0b038d36eaa535c9f12580fcf1e3a5ad Mon Sep 17 00:00:00 2001 From: Rebecca Date: Thu, 12 Oct 2023 13:39:32 -0400 Subject: [PATCH 06/17] Add visualization data to DAG; update initial tables --- company_linkage/parat_data_dag.py | 41 +++++++++++++++---- .../sequences/visualization_data.csv | 8 ++-- .../sql/initial_visualization_data.sql | 16 +++----- .../visualization_data_with_all_papers.sql | 11 ++--- .../sql/visualization_data_with_by_year.sql | 16 ++++---- .../visualization_data_with_top_papers.sql | 8 ++-- 6 files changed, 56 insertions(+), 44 deletions(-) diff --git a/company_linkage/parat_data_dag.py b/company_linkage/parat_data_dag.py index 029a0ec3..0bde5f58 100644 --- a/company_linkage/parat_data_dag.py +++ b/company_linkage/parat_data_dag.py @@ -146,11 +146,6 @@ write_disposition="WRITE_TRUNCATE" ) - # TODO: somewhere in here we need to decide whether to load directly to the main table - # or to add a transfer step to transfer from staging to the main table; if 
the latter - # are there checks we want to add first? - # for now, pretend the data is in the main table already - run_get_ai_counts = GKEStartPodOperator( task_id="run_get_ai_counts", project_id=PROJECT_ID, @@ -187,7 +182,7 @@ load_ai_papers = GCSToBigQueryOperator( task_id=f"load_ai_company_papers", bucket=DATA_BUCKET, - source_objects=[f"{tmp_dir}/ai_company_papers.jsonl"], + source_objects=[f"{tmp_dir}/ai/ai_company_papers.jsonl"], schema_object=f"{schema_dir}/ai_papers_schema.json", destination_project_dataset_table=f"{staging_dataset}.ai_company_papers", source_format="NEWLINE_DELIMITED_JSON", @@ -198,7 +193,7 @@ load_ai_patents = GCSToBigQueryOperator( task_id=f"load_ai_company_patents", bucket=DATA_BUCKET, - source_objects=[f"{tmp_dir}/ai_company_patents.jsonl"], + source_objects=[f"{tmp_dir}/ai/ai_company_patents.jsonl"], schema_object=f"{schema_dir}/ai_patents_schema.json", destination_project_dataset_table=f"{staging_dataset}.ai_company_patents", source_format="NEWLINE_DELIMITED_JSON", @@ -249,7 +244,7 @@ load_top_papers = GCSToBigQueryOperator( task_id=f"load_top_papers", bucket=DATA_BUCKET, - source_objects=[f"{tmp_dir}/top_paper_counts.jsonl"], + source_objects=[f"{tmp_dir}/top/top_paper_counts.jsonl"], schema_object=f"{schema_dir}/top_papers_schema.json", destination_project_dataset_table=f"{staging_dataset}.top_paper_counts", source_format="NEWLINE_DELIMITED_JSON", @@ -260,7 +255,7 @@ load_all_papers = GCSToBigQueryOperator( task_id=f"load_all_papers", bucket=DATA_BUCKET, - source_objects=[f"{tmp_dir}/all_paper_counts.jsonl"], + source_objects=[f"{tmp_dir}/all/all_paper_counts.jsonl"], schema_object=f"{schema_dir}/all_papers_schema.json", destination_project_dataset_table=f"{staging_dataset}.all_paper_counts", source_format="NEWLINE_DELIMITED_JSON", @@ -268,8 +263,35 @@ write_disposition="WRITE_TRUNCATE" ) + start_visualization_tables = DummyOperator(task_id="start_visualization_tables") + wait_for_visualization_tables = 
DummyOperator(task_id="wait_for_visualization_tables") + visualization_query_sequence = "visualization_data.csv" + curr = start_visualization_tables + for line in open(seq_path_prefix + visualization_query_sequence).readlines(): + dataset, table = line.split(",") + table_name = f"{dataset}.{table.strip()}" + next_tab = BigQueryInsertJobOperator( + task_id=f"create_{table_name}", + configuration={ + "query": { + "query": "{% include '" + f"{sql_dir}/{table.strip()}.sql" + "' %}", + "useLegacySql": False, + "destinationTable": { + "projectId": PROJECT_ID, + "datasetId": dataset, + "tableId": table + }, + "allowLargeResults": True, + "createDisposition": "CREATE_IF_NEEDED", + "writeDisposition": "WRITE_TRUNCATE" + } + }, + ) + curr >> next_tab + curr = next_tab + curr >> wait_for_visualization_tables ( @@ -288,5 +310,6 @@ >> run_papers >> load_top_papers >> load_all_papers + >> start_visualization_tables ) diff --git a/company_linkage/sequences/visualization_data.csv b/company_linkage/sequences/visualization_data.csv index f6a24560..09ae1630 100644 --- a/company_linkage/sequences/visualization_data.csv +++ b/company_linkage/sequences/visualization_data.csv @@ -2,18 +2,18 @@ staging_ai_companies_visualization,initial_visualization_data staging_ai_companies_visualization,visualization_data_with_by_year staging_ai_companies_visualization,visualization_data_with_top_papers staging_ai_companies_visualization,visualization_data_with_all_papers -staging_ai_companies_visualization,visualization_data_omit_by_rule -staging_ai_companies_visualization,visualization_data staging_ai_companies_visualization,initial_patent_visualization_data staging_ai_companies_visualization,patent_visualization_data_with_by_year -staging_ai_companies_visualization,patent_visualization_data staging_ai_companies_visualization,initial_paper_visualization_data staging_ai_companies_visualization,paper_visualization_data_with_mag staging_ai_companies_visualization,paper_visualization_data_with_clusters 
staging_ai_companies_visualization,paper_visualization_data_with_company_references staging_ai_companies_visualization,paper_visualization_data_with_tasks staging_ai_companies_visualization,paper_visualization_data_with_methods -staging_ai_companies_visualization,paper_visualization_data staging_ai_companies_visualization,initial_workforce_visualization_data staging_ai_companies_visualization,workforce_visualization_data_with_ai_jobs +staging_ai_companies_visualization,visualization_data_omit_by_rule +staging_ai_companies_visualization,visualization_data +staging_ai_companies_visualization,patent_visualization_data +staging_ai_companies_visualization,paper_visualization_data staging_ai_companies_visualization,workforce_visualization_data \ No newline at end of file diff --git a/company_linkage/sql/initial_visualization_data.sql b/company_linkage/sql/initial_visualization_data.sql index 64097b5d..c7eb2642 100644 --- a/company_linkage/sql/initial_visualization_data.sql +++ b/company_linkage/sql/initial_visualization_data.sql @@ -1,12 +1,8 @@ -- This query pulls the initial visualization data for the table that doesn't have to be compiled (as it's already -- available in the organizations table) and adds in the AI publication counts. 
- - -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS WITH aipubs AS ( - -- Pulling all the papers with any of the given GRIDs as affiliates + -- Pulling all the papers with any of the given RORs as affiliates SELECT CSET_id, merged_id, @@ -14,8 +10,8 @@ WITH nlp, robotics FROM - ai_companies_visualization.ai_company_pubs), - gridtable AS ( + staging_ai_companies_visualization.ai_company_papers), + rortable AS ( -- Getting the count of publications SELECT CSET_id, @@ -41,7 +37,7 @@ SELECT market, crunchbase, child_crunchbase, - grid, + ror_id, linkedin, in_sandp_500, in_fortune_global_500, @@ -50,8 +46,8 @@ SELECT COALESCE(nlp_pubs, 0) as nlp_pubs, COALESCE(robotics_pubs, 0) as robotics_pubs FROM - `gcp-cset-projects.high_resolution_entities.aggregated_organizations` AS orgs + high_resolution_entities.aggregated_organizations LEFT JOIN - gridtable + rortable USING (CSET_id) \ No newline at end of file diff --git a/company_linkage/sql/visualization_data_with_all_papers.sql b/company_linkage/sql/visualization_data_with_all_papers.sql index 073a6dce..67901809 100644 --- a/company_linkage/sql/visualization_data_with_all_papers.sql +++ b/company_linkage/sql/visualization_data_with_all_papers.sql @@ -1,6 +1,4 @@ -- Update the visualization table itself to add total paper data -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS -- Pull in the total paper counts, along with the CSET ids to link them in WITH count_data AS ( @@ -9,14 +7,13 @@ WITH all_pubs, all_pubs_by_year, FROM - `gcp-cset-projects.ai_companies_visualization.total_paper_counts`), - -- Pull in the current visualization data. 
Exclude the all_paper data, since that was included when we built the all paper data, so we don't need it + staging_ai_companies_visualization.all_paper_counts), + -- Pull in the current visualization data viz_data AS ( SELECT - * EXCEPT(all_pubs, - all_pubs_by_year) + * FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data`) + staging_ai_companies_visualization.visualization_data_with_top_papers) -- Join the two together using the CSET id SELECT viz_data.*, diff --git a/company_linkage/sql/visualization_data_with_by_year.sql b/company_linkage/sql/visualization_data_with_by_year.sql index 51db60ec..d229b93f 100644 --- a/company_linkage/sql/visualization_data_with_by_year.sql +++ b/company_linkage/sql/visualization_data_with_by_year.sql @@ -1,7 +1,5 @@ -- Adding AI publication data by year to the visualization table -- This uses the same mechanism as adding AI publication counts; we're just doing it on a by-year basis -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS WITH aipubs AS ( -- Pulling all the papers with any of the given GRIDs as affiliates @@ -13,8 +11,8 @@ WITH nlp, robotics FROM - ai_companies_visualization.ai_company_pubs), - gridtable AS ( + staging_ai_companies_visualization.ai_company_papers), + rortable AS ( -- Getting the count of publications SELECT CSET_id, @@ -49,23 +47,23 @@ WITH ORDER BY year) AS robotics_pubs_by_year, FROM - `gcp-cset-projects.high_resolution_entities.aggregated_organizations` AS orgs + high_resolution_entities.aggregated_organizations LEFT JOIN - gridtable + rortable USING (CSET_id) GROUP BY CSET_id) SELECT - viz.*, + initial_visualization_data.*, ai_pubs_by_year, cv_pubs_by_year, nlp_pubs_by_year, robotics_pubs_by_year FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data` AS viz + staging_ai_companies_visualization.initial_visualization_data LEFT JOIN by_year ON - viz.CSET_id = by_year.CSET_id + initial_visualization_data.CSET_id = by_year.CSET_id ORDER BY 
cset_id \ No newline at end of file diff --git a/company_linkage/sql/visualization_data_with_top_papers.sql b/company_linkage/sql/visualization_data_with_top_papers.sql index cf5e0d47..7277bd51 100644 --- a/company_linkage/sql/visualization_data_with_top_papers.sql +++ b/company_linkage/sql/visualization_data_with_top_papers.sql @@ -1,6 +1,4 @@ -- Update the visualization table itself to add top paper data -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS -- Pull in the top paper counts, along with the CSET ids to link them in WITH count_data AS ( @@ -9,13 +7,13 @@ WITH ai_pubs_in_top_conferences, ai_pubs_in_top_conferences_by_year, FROM - `gcp-cset-projects.ai_companies_visualization.top_paper_counts`), - -- Pull in the current visualization data. Exclude the ai_pubs_in_top_conferences data, since that was included when we built the top paper data, so we don't need it + staging_ai_companies_visualization.top_paper_counts), + -- Pull in the current visualization data. 
viz_data AS ( SELECT * FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data`) + staging_ai_companies_visualization.visualization_data_with_by_year) -- Join the two together using the CSET id SELECT viz_data.*, From d06ff09e12cece66eb044ebf7de245e8d9650416 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Fri, 13 Oct 2023 11:13:06 -0400 Subject: [PATCH 07/17] Update paper visualization other than tasks+methods --- .../sql/initial_paper_visualization_data.sql | 10 +++++----- .../paper_visualization_data_with_clusters.sql | 10 ++++------ ...isualization_data_with_company_references.sql | 16 +++++++--------- .../sql/paper_visualization_data_with_mag.sql | 14 ++++++-------- 4 files changed, 22 insertions(+), 28 deletions(-) diff --git a/company_linkage/sql/initial_paper_visualization_data.sql b/company_linkage/sql/initial_paper_visualization_data.sql index ba20b588..57d83813 100644 --- a/company_linkage/sql/initial_paper_visualization_data.sql +++ b/company_linkage/sql/initial_paper_visualization_data.sql @@ -2,14 +2,14 @@ WITH get_citations AS ( SELECT DISTINCT CSET_id, - refs_merged.merged_id, + references.merged_id, ref_id FROM - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers LEFT JOIN - `gcp-cset-projects.gcp_cset_links_v2.paper_references_merged` refs_merged + literature.references ON - (ai_company_pubs.merged_id = ref_id)), + (ai_company_papers.merged_id = ref_id)), add_year AS ( SELECT DISTINCT CSET_id, @@ -19,7 +19,7 @@ WITH FROM get_citations LEFT JOIN - gcp_cset_links_v2.corpus_merged + literature.papers USING (merged_id) WHERE diff --git a/company_linkage/sql/paper_visualization_data_with_clusters.sql b/company_linkage/sql/paper_visualization_data_with_clusters.sql index 320490ea..637843b2 100644 --- a/company_linkage/sql/paper_visualization_data_with_clusters.sql +++ b/company_linkage/sql/paper_visualization_data_with_clusters.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE TABLE - 
ai_companies_visualization.paper_visualization_data AS WITH company_cluster_assignment AS ( SELECT @@ -7,9 +5,9 @@ WITH merged_id, cluster_id FROM - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers LEFT JOIN - `gcp-cset-projects.science_map_v2.dc5_cluster_assignment_stable` + map_of_science.cluster_assignment USING (merged_id) WHERE @@ -36,10 +34,10 @@ WITH GROUP BY CSET_id) SELECT - paper_visualization_data.*, + paper_visualization_data_with_mag.*, clusters FROM - ai_companies_visualization.paper_visualization_data + staging_ai_companies_visualization.paper_visualization_data_with_mag LEFT JOIN aggregated_clusters USING diff --git a/company_linkage/sql/paper_visualization_data_with_company_references.sql b/company_linkage/sql/paper_visualization_data_with_company_references.sql index 2b49b118..f935f350 100644 --- a/company_linkage/sql/paper_visualization_data_with_company_references.sql +++ b/company_linkage/sql/paper_visualization_data_with_company_references.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.paper_visualization_data AS -- First get all the articles cited by the AI papers written by our companies WITH get_references AS ( @@ -8,9 +6,9 @@ WITH merged_id, ref_id FROM - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers LEFT JOIN - `gcp-cset-projects.gcp_cset_links_v2.paper_references_merged` + literature.references USING (merged_id)), referenced_companies AS ( @@ -18,13 +16,13 @@ WITH DISTINCT get_references.CSET_id, get_references.merged_id, ref_id, - ai_company_pubs.CSET_id AS ref_CSET_id + ai_company_papers.CSET_id AS ref_CSET_id FROM get_references INNER JOIN - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers ON - ref_id = ai_company_pubs.merged_id + ref_id = ai_company_papers.merged_id ORDER BY CSET_id), count_company_refs AS ( @@ -54,10 +52,10 @@ GROUP BY ORDER BY CSET_id) 
SELECT - paper_visualization_data.*, + paper_visualization_data_with_clusters.*, company_references FROM - ai_companies_visualization.paper_visualization_data + staging_ai_companies_visualization.paper_visualization_data_with_clusters LEFT JOIN aggregated_refs USING diff --git a/company_linkage/sql/paper_visualization_data_with_mag.sql b/company_linkage/sql/paper_visualization_data_with_mag.sql index 7223d747..46e93f76 100644 --- a/company_linkage/sql/paper_visualization_data_with_mag.sql +++ b/company_linkage/sql/paper_visualization_data_with_mag.sql @@ -1,19 +1,17 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.paper_visualization_data AS WITH names AS ( SELECT field_id AS child_field_id, name FROM - `gcp-cset-projects.fields_of_study.field_meta`), + fields_of_study.field_meta), ai_subfields AS ( SELECT field_id, child_field_id, name AS child_name FROM - `gcp-cset-projects.fields_of_study.field_children` + fields_of_study.field_children LEFT JOIN names USING @@ -43,7 +41,7 @@ WITH field.id AS field_id, field.name AS field_name FROM - `gcp-cset-projects.fields_of_study.top_fields` + fields_of_study.top_fields CROSS JOIN UNNEST(fields) AS field INNER JOIN @@ -59,7 +57,7 @@ WITH field_id, field_name FROM - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers LEFT JOIN articles_with_ai_subfields USING @@ -88,10 +86,10 @@ WITH GROUP BY CSET_id) SELECT - paper_visualization_data.*, + initial_paper_visualization_data.*, fields FROM - ai_companies_visualization.paper_visualization_data + staging_ai_companies_visualization.initial_paper_visualization_data LEFT JOIN aggregated_fields USING From 61d0e4680db9fc3007c5d40ddf9a7fe12d69cb20 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Fri, 13 Oct 2023 11:21:29 -0400 Subject: [PATCH 08/17] Update patent visualization data --- company_linkage/data/omit.csv | 40 ------------------- .../sql/initial_patent_visualization_data.sql | 4 +- ...patent_visualization_data_with_by_year.sql 
| 8 ++-- 3 files changed, 5 insertions(+), 47 deletions(-) delete mode 100644 company_linkage/data/omit.csv diff --git a/company_linkage/data/omit.csv b/company_linkage/data/omit.csv deleted file mode 100644 index 5e432fc3..00000000 --- a/company_linkage/data/omit.csv +++ /dev/null @@ -1,40 +0,0 @@ -CSET_id -100 -296 -346 -374 -380 -386 -412 -418 -464 -467 -495 -612 -628 -633 -649 -724 -728 -756 -767 -2287 -2774 -2778 -2784 -2789 -2806 -2815 -2831 -2850 -2851 -2855 -2875 -2922 -2956 -2976 -2977 -2981 -2987 -3036 -3058 \ No newline at end of file diff --git a/company_linkage/sql/initial_patent_visualization_data.sql b/company_linkage/sql/initial_patent_visualization_data.sql index 7720057f..f5ef5a68 100644 --- a/company_linkage/sql/initial_patent_visualization_data.sql +++ b/company_linkage/sql/initial_patent_visualization_data.sql @@ -4,7 +4,7 @@ WITH SELECT * FROM - ai_companies_visualization.ai_company_patents), + staging_ai_companies_visualization.ai_company_patents), pattable AS ( -- Getting the count of patents SELECT @@ -90,7 +90,7 @@ SELECT COALESCE(Machine_Learning_pats, 0) as Machine_Learning_pats, COALESCE(Search_Methods_pats, 0) as Search_Methods_pats, FROM - `gcp-cset-projects.high_resolution_entities.aggregated_organizations` AS orgs + high_resolution_entities.aggregated_organizations LEFT JOIN pattable USING diff --git a/company_linkage/sql/patent_visualization_data_with_by_year.sql b/company_linkage/sql/patent_visualization_data_with_by_year.sql index 5804c8f9..06ed1457 100644 --- a/company_linkage/sql/patent_visualization_data_with_by_year.sql +++ b/company_linkage/sql/patent_visualization_data_with_by_year.sql @@ -1,12 +1,10 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.patent_visualization_data AS WITH aipats AS ( -- Pulling all the patents from any of our companies SELECT * FROM - ai_companies_visualization.ai_company_patents), + staging_ai_companies_visualization.ai_company_patents), pattable AS ( -- Getting the count of patents 
SELECT @@ -202,7 +200,7 @@ WITH priority_year) AS Search_Methods_pats_by_year, FROM - `gcp-cset-projects.high_resolution_entities.aggregated_organizations` AS orgs + high_resolution_entities.aggregated_organizations LEFT JOIN pattable USING @@ -215,7 +213,7 @@ SELECT viz.*, by_year.* EXCEPT (CSET_id) FROM - `gcp-cset-projects.ai_companies_visualization.patent_visualization_data` AS viz + staging_ai_companies_visualization.initial_patent_visualization_data AS viz LEFT JOIN by_year USING From acebb36357963469e76460b70662ab6f5005ee6a Mon Sep 17 00:00:00 2001 From: Rebecca Date: Fri, 13 Oct 2023 11:30:56 -0400 Subject: [PATCH 09/17] Update company omission SQL --- company_linkage/sql/paper_visualization_data.sql | 10 ++++------ company_linkage/sql/patent_visualization_data.sql | 10 ++++------ .../sql/visualization_data_omit_by_rule.sql | 10 ++++------ company_linkage/sql/workforce_visualization_data.sql | 10 ++++------ 4 files changed, 16 insertions(+), 24 deletions(-) diff --git a/company_linkage/sql/paper_visualization_data.sql b/company_linkage/sql/paper_visualization_data.sql index 00e03c81..122b58a8 100644 --- a/company_linkage/sql/paper_visualization_data.sql +++ b/company_linkage/sql/paper_visualization_data.sql @@ -1,20 +1,18 @@ -CREATE OR REPLACE TABLE - `gcp-cset-projects.ai_companies_visualization.paper_visualization_data` AS -- Selecting the companies we want to leave out WITH to_omit AS ( SELECT CSET_id FROM - ai_companies_visualization.visualization_data + staging_ai_companies_visualization.visualization_data_omit_by_year RIGHT JOIN - ai_companies_visualization.paper_visualization_data + staging_ai_companies_visualization.paper_visualization_data_with_methods USING (cset_id) - WHERE visualization_data.cset_id IS NULL) + WHERE visualization_data_omit_by_year.cset_id IS NULL) SELECT * FROM - `gcp-cset-projects.ai_companies_visualization.paper_visualization_data` + staging_ai_companies_visualization.paper_visualization_data_with_methods WHERE CSET_id NOT IN 
( SELECT diff --git a/company_linkage/sql/patent_visualization_data.sql b/company_linkage/sql/patent_visualization_data.sql index 8781112e..29818a23 100644 --- a/company_linkage/sql/patent_visualization_data.sql +++ b/company_linkage/sql/patent_visualization_data.sql @@ -1,20 +1,18 @@ -CREATE OR REPLACE TABLE - `gcp-cset-projects.ai_companies_visualization.patent_visualization_data` AS -- Selecting the companies we want to leave out WITH to_omit AS ( SELECT CSET_id FROM - ai_companies_visualization.visualization_data + staging_ai_companies_visualization.visualization_data_omit_by_rule RIGHT JOIN - ai_companies_visualization.patent_visualization_data + staging_ai_companies_visualization.patent_visualization_data_with_by_year USING (cset_id) - WHERE visualization_data.cset_id IS NULL) + WHERE visualization_data_omit_by_rule.cset_id IS NULL) SELECT * FROM - `gcp-cset-projects.ai_companies_visualization.patent_visualization_data` + staging_ai_companies_visualization.patent_visualization_data_with_by_year WHERE CSET_id NOT IN ( SELECT diff --git a/company_linkage/sql/visualization_data_omit_by_rule.sql b/company_linkage/sql/visualization_data_omit_by_rule.sql index 597e871c..bef7cb15 100644 --- a/company_linkage/sql/visualization_data_omit_by_rule.sql +++ b/company_linkage/sql/visualization_data_omit_by_rule.sql @@ -1,17 +1,15 @@ -CREATE OR REPLACE TABLE - `gcp-cset-projects.ai_companies_visualization.visualization_data` AS -- Selecting the companies we want to leave out WITH to_omit AS ( SELECT CSET_id FROM - ai_companies_visualization.visualization_data + staging_ai_companies_visualization.visualization_data_with_all_papers LEFT JOIN - ai_companies_visualization.patent_visualization_data + staging_ai_companies_visualization.patent_visualization_data_with_by_year USING (cset_id) LEFT JOIN - ai_companies_visualization.workforce_visualization_data + staging_ai_companies_visualization.workforce_visualization_data_with_ai_jobs USING (cset_id) WHERE @@ -32,7 +30,7 @@ WITH 
SELECT * FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data` + staging_ai_companies_visualization.visualization_data_with_all_papers WHERE CSET_id NOT IN ( SELECT diff --git a/company_linkage/sql/workforce_visualization_data.sql b/company_linkage/sql/workforce_visualization_data.sql index 40a25b3c..3779e5e3 100644 --- a/company_linkage/sql/workforce_visualization_data.sql +++ b/company_linkage/sql/workforce_visualization_data.sql @@ -1,20 +1,18 @@ -CREATE OR REPLACE TABLE - `gcp-cset-projects.ai_companies_visualization.workforce_visualization_data` AS -- Selecting the companies we want to leave out WITH to_omit AS ( SELECT CSET_id FROM - ai_companies_visualization.visualization_data + staging_ai_companies_visualization.visualization_data_omit_by_rule RIGHT JOIN - ai_companies_visualization.workforce_visualization_data + staging_ai_companies_visualization.workforce_visualization_data_with_ai_jobs USING (cset_id) - WHERE visualization_data.cset_id IS NULL) + WHERE visualization_data_omit_by_rule.cset_id IS NULL) SELECT * FROM - `gcp-cset-projects.ai_companies_visualization.workforce_visualization_data` + staging_ai_companies_visualization.workforce_visualization_data_with_ai_jobs WHERE CSET_id NOT IN ( SELECT From cc60fb2aa45c0ae34414d8a612f86a0121e3b021 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Fri, 13 Oct 2023 12:45:59 -0400 Subject: [PATCH 10/17] Update crunchbase linkage --- company_linkage/sql/visualization_data.sql | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/company_linkage/sql/visualization_data.sql b/company_linkage/sql/visualization_data.sql index 1f48d965..ad7022b7 100644 --- a/company_linkage/sql/visualization_data.sql +++ b/company_linkage/sql/visualization_data.sql @@ -1,14 +1,12 @@ -- We're adding useful Crunchbase data to the visualization: descriptions, logos, and the company's "stage" -- (which we're using as a proxy for its size/growth but is actually based on what funding it has 
received). -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS WITH -- Pull in all the visualization data, most importantly including the crunchbase uuid that will be used to connect to everything else visualization AS ( SELECT * FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data`), + staging_ai_companies_visualization.visualization_data_omit_by_rule), -- Grab the descriptions and logos from Crunchbase ODM odm_data AS ( SELECT @@ -16,7 +14,7 @@ WITH short_description, logo_url FROM - `gcp-cset-projects.gcp_cset_crunchbase.organizations_odm`), + gcp_cset_crunchbase.organizations_odm), -- Grab the raw stage data for companies -- Since companies have multiple funding rounds they may have multiple rows! -- We need to deal with this @@ -51,7 +49,7 @@ WITH END AS stage FROM - `gcp-cset-projects.gcp_cset_crunchbase.funding_rounds`), + gcp_cset_crunchbase.funding_rounds), -- Now we want only one stage value to come out for any given company -- If a company has ever been mature, it's no longer growth or startup, etc. 
-- So there's a clear hierarchy, and we take the max @@ -78,9 +76,9 @@ WITH FROM combine_stages LEFT JOIN - gcp_cset_crunchbase.organizations orgs + gcp_cset_crunchbase.organizations ON - combine_stages.org_uuid = orgs.uuid ), + combine_stages.org_uuid = organizations.uuid ), stage_name AS ( SELECT org_uuid, @@ -111,7 +109,7 @@ FROM LEFT JOIN odm_data ON - visualization.crunchbase.crunchbase_uuid = odm_data.uuid + TRIM(visualization.crunchbase.crunchbase_uuid) = TRIM(odm_data.uuid) LEFT JOIN stage_name ON From 28a0937a547de98e626f121ecb125feb34704223 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Fri, 13 Oct 2023 16:37:24 -0400 Subject: [PATCH 11/17] Reorder readme to reflect new ordering --- company_linkage/README.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/company_linkage/README.md b/company_linkage/README.md index e3248a3b..6a585284 100644 --- a/company_linkage/README.md +++ b/company_linkage/README.md @@ -33,23 +33,23 @@ run some of this code as-is. 15. Upload `all_paper_counts.jsonl` to `ai_companies_visualization.total_paper_counts` using the [all_papers_schema](schemas/all_papers_schema.json) 16. [initial_visualization_data.sql](sql/initial_visualization_data.sql) 17. [visualization_data_with_by_year.sql](sql/visualization_data_with_by_year.sql) -18. [initial_patent_visualization_data.sql](sql/initial_patent_visualization_data.sql) -19. [patent_visualization_data_with_by_year.sql](sql/patent_visualization_data_with_by_year.sql) -20. [initial_paper_visualization_data.sql](sql/initial_paper_visualization_data.sql) -21. [paper_visualization_data_with_mag.sql](sql/paper_visualization_data_with_mag.sql) -22. [paper_visualization_data_with_clusters.sql](sql/paper_visualization_data_with_clusters.sql) -23. [paper_visualization_data_with_company_references.sql](sql/paper_visualization_data_with_company_references.sql) -24. [paper_visualization_data_with_tasks.sql](sql/paper_visualization_data_with_tasks.sql) -25. 
[paper_visualization_data_with_methods.sql](sql/paper_visualization_data_with_methods.sql) -26. [visualization_data_with_top_papers.sql](sql/visualization_data_with_top_papers.sql) -27. [visualization_data_with_all_papers.sql](sql/visualization_data_with_all_papers.sql) +18. [visualization_data_with_top_papers.sql](sql/visualization_data_with_top_papers.sql) +19. [visualization_data_with_all_papers.sql](sql/visualization_data_with_all_papers.sql) +20. [initial_patent_visualization_data.sql](sql/initial_patent_visualization_data.sql) +21. [patent_visualization_data_with_by_year.sql](sql/patent_visualization_data_with_by_year.sql) +22. [initial_paper_visualization_data.sql](sql/initial_paper_visualization_data.sql) +23. [paper_visualization_data_with_mag.sql](sql/paper_visualization_data_with_mag.sql) +24. [paper_visualization_data_with_clusters.sql](sql/paper_visualization_data_with_clusters.sql) +25. [paper_visualization_data_with_company_references.sql](sql/paper_visualization_data_with_company_references.sql) +26. [paper_visualization_data_with_tasks.sql](sql/paper_visualization_data_with_tasks.sql) +27. [paper_visualization_data_with_methods.sql](sql/paper_visualization_data_with_methods.sql) 28. [initial_workforce_visualization_data.sql](sql/initial_workforce_visualization_data.sql) 29. [workforce_visualization_data_with_ai_jobs.sql](sql/workforce_visualization_data_with_ai_jobs.sql) 30. [visualization_data_omit_by_rule.sql](sql/visualization_data_omit_by_rule.sql) -31. [paper_visualization_data.sql](sql/paper_visualization_data.sql) +31. [visualization_data.sql](sql/visualization_data.sql) 32. [patent_visualization_data.sql](sql/patent_visualization_data.sql) -33. [workforce_visualization_data.sql](sql/workforce_visualization_data.sql) -34. [visualization_data.sql](sql/visualization_data.sql) +33. [paper_visualization_data.sql](sql/paper_visualization_data.sql) +34. 
[workforce_visualization_data.sql](sql/workforce_visualization_data.sql) # Deployment From 02f7ee1d45cc30982c085478028e2f520d895426 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Mon, 16 Oct 2023 10:27:33 -0400 Subject: [PATCH 12/17] Add some initial check queries --- .../sql/check_all_paper_counts_greater.sql | 5 +++ .../sql/check_paper_counts_exist.sql | 11 +++++ .../sql/check_paper_have_all_ids_pre_omit.sql | 11 +++++ .../sql/check_patent_counts_exist.sql | 40 +++++++++++++++++++ .../check_patent_have_all_ids_pre_omit.sql | 11 +++++ ...ck_visualization_have_all_ids_pre_omit.sql | 11 +++++ .../check_workforce_have_all_ids_pre_omit.sql | 11 +++++ 7 files changed, 100 insertions(+) create mode 100644 company_linkage/sql/check_all_paper_counts_greater.sql create mode 100644 company_linkage/sql/check_paper_counts_exist.sql create mode 100644 company_linkage/sql/check_paper_have_all_ids_pre_omit.sql create mode 100644 company_linkage/sql/check_patent_counts_exist.sql create mode 100644 company_linkage/sql/check_patent_have_all_ids_pre_omit.sql create mode 100644 company_linkage/sql/check_visualization_have_all_ids_pre_omit.sql create mode 100644 company_linkage/sql/check_workforce_have_all_ids_pre_omit.sql diff --git a/company_linkage/sql/check_all_paper_counts_greater.sql b/company_linkage/sql/check_all_paper_counts_greater.sql new file mode 100644 index 00000000..f3e20b8b --- /dev/null +++ b/company_linkage/sql/check_all_paper_counts_greater.sql @@ -0,0 +1,5 @@ +SELECT + LOGICAL_AND(all_pubs >= ai_pubs) + AND LOGICAL_AND(all_pubs >= ai_pubs_in_top_conferences) +FROM + staging_ai_companies_visualization.visualization_data_with_all_papers \ No newline at end of file diff --git a/company_linkage/sql/check_paper_counts_exist.sql b/company_linkage/sql/check_paper_counts_exist.sql new file mode 100644 index 00000000..4171ab87 --- /dev/null +++ b/company_linkage/sql/check_paper_counts_exist.sql @@ -0,0 +1,11 @@ +SELECT + COUNT(*) = 0 +FROM + 
staging_ai_companies_visualization.visualization_data_with_all_papers +WHERE + ai_pubs IS NULL + OR robotics_pubs IS NULL + OR cv_pubs IS NULL + OR nlp_pubs IS NULL + OR ai_pubs_in_top_conferences IS NULL + OR all_pubs IS NULL \ No newline at end of file diff --git a/company_linkage/sql/check_paper_have_all_ids_pre_omit.sql b/company_linkage/sql/check_paper_have_all_ids_pre_omit.sql new file mode 100644 index 00000000..3b33804a --- /dev/null +++ b/company_linkage/sql/check_paper_have_all_ids_pre_omit.sql @@ -0,0 +1,11 @@ +-- Check that the paper visualization data table has all the CSET organization ids in the table before +-- we run omit by rule +SELECT + COUNT(DISTINCT paper_visualization_data_with_methods.CSET_id) = COUNT(DISTINCT aggregated_organizations.CSET_id) + AND LOGICAL_AND(paper_visualization_data_with_methods.CSET_id IS NOT NULL) +FROM + staging_ai_companies_visualization.paper_visualization_data_with_methods +FULL OUTER JOIN + high_resolution_entities.aggregated_organizations +USING + (CSET_id) \ No newline at end of file diff --git a/company_linkage/sql/check_patent_counts_exist.sql b/company_linkage/sql/check_patent_counts_exist.sql new file mode 100644 index 00000000..e90821f6 --- /dev/null +++ b/company_linkage/sql/check_patent_counts_exist.sql @@ -0,0 +1,40 @@ +SELECT + COUNT(*) = 0 +FROM + staging_ai_companies_visualization.patent_visualization_data_with_by_year +WHERE + ai_patents IS NULL + OR Physical_Sciences_and_Engineering_pats IS NULL + OR Life_Sciences_pats IS NULL + OR Security__eg_cybersecurity_pats IS NULL + OR Transportation_pats IS NULL + OR Education_pats IS NULL + OR Document_Mgt_and_Publishing_pats IS NULL + OR Military_pats IS NULL + OR Agricultural_pats IS NULL + OR Computing_in_Government_pats IS NULL + OR Personal_Devices_and_Computing_pats IS NULL + OR Banking_and_Finance_pats IS NULL + OR Telecommunications_pats IS NULL + OR Networks__eg_social_IOT_etc_pats IS NULL + OR Business_pats IS NULL + OR Energy_Management_pats IS 
NULL + OR Entertainment_pats IS NULL + OR Nanotechnology_pats IS NULL + OR Semiconductors_pats IS NULL + OR Language_Processing_pats IS NULL + OR Speech_Processing_pats IS NULL + OR Knowledge_Representation_pats IS NULL + OR Planning_and_Scheduling_pats IS NULL + OR Control_pats IS NULL + OR Distributed_AI_pats IS NULL + OR Robotics_pats IS NULL + OR Computer_Vision_pats IS NULL + OR Analytics_and_Algorithms_pats IS NULL + OR Measuring_and_Testing_pats IS NULL + OR Logic_Programming_pats IS NULL + OR Fuzzy_Logic_pats IS NULL + OR Probabilistic_Reasoning_pats IS NULL + OR Ontology_Engineering_pats IS NULL + OR Machine_Learning_pats IS NULL + OR Search_Methods_pats IS NULL \ No newline at end of file diff --git a/company_linkage/sql/check_patent_have_all_ids_pre_omit.sql b/company_linkage/sql/check_patent_have_all_ids_pre_omit.sql new file mode 100644 index 00000000..3aa42d29 --- /dev/null +++ b/company_linkage/sql/check_patent_have_all_ids_pre_omit.sql @@ -0,0 +1,11 @@ +-- Check that the patent visualization data table has all the CSET organization ids in the table before +-- we run omit by rule +SELECT + COUNT(DISTINCT patent_visualization_data_with_by_year.CSET_id) = COUNT(DISTINCT aggregated_organizations.CSET_id) + AND LOGICAL_AND(patent_visualization_data_with_by_year.CSET_id IS NOT NULL) +FROM + staging_ai_companies_visualization.patent_visualization_data_with_by_year +FULL OUTER JOIN + high_resolution_entities.aggregated_organizations +USING + (CSET_id) \ No newline at end of file diff --git a/company_linkage/sql/check_visualization_have_all_ids_pre_omit.sql b/company_linkage/sql/check_visualization_have_all_ids_pre_omit.sql new file mode 100644 index 00000000..cd5532fb --- /dev/null +++ b/company_linkage/sql/check_visualization_have_all_ids_pre_omit.sql @@ -0,0 +1,11 @@ +-- Check that the visualization data table has all the CSET organization ids in the table before +-- we run omit by rule +SELECT + COUNT(DISTINCT visualization_data_with_all_papers.CSET_id) 
= COUNT(DISTINCT aggregated_organizations.CSET_id) + AND LOGICAL_AND(visualization_data_with_all_papers.CSET_id IS NOT NULL) +FROM + staging_ai_companies_visualization.visualization_data_with_all_papers +FULL OUTER JOIN + high_resolution_entities.aggregated_organizations +USING + (CSET_id) \ No newline at end of file diff --git a/company_linkage/sql/check_workforce_have_all_ids_pre_omit.sql b/company_linkage/sql/check_workforce_have_all_ids_pre_omit.sql new file mode 100644 index 00000000..fb9aaeb2 --- /dev/null +++ b/company_linkage/sql/check_workforce_have_all_ids_pre_omit.sql @@ -0,0 +1,11 @@ +-- Check that the workforce visualization data table has all the CSET organization ids in the table before +-- we run omit by rule +SELECT + COUNT(DISTINCT workforce_visualization_data_with_ai_jobs.CSET_id) = COUNT(DISTINCT aggregated_organizations.CSET_id) + AND LOGICAL_AND(workforce_visualization_data_with_ai_jobs.CSET_id IS NOT NULL) +FROM + staging_ai_companies_visualization.workforce_visualization_data_with_ai_jobs +FULL OUTER JOIN + high_resolution_entities.aggregated_organizations +USING + (CSET_id) \ No newline at end of file From 8a00cd0f4573f10712384b2b9432e088c53262c8 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Tue, 17 Oct 2023 11:53:02 -0400 Subject: [PATCH 13/17] Switch name to merged_id in predictions table --- company_linkage/sql/ai_publications.sql | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/company_linkage/sql/ai_publications.sql b/company_linkage/sql/ai_publications.sql index 2aacb37f..0a000a1d 100644 --- a/company_linkage/sql/ai_publications.sql +++ b/company_linkage/sql/ai_publications.sql @@ -1,10 +1,7 @@ - -- Pulling every AI-associated publication id linked to every grid id and every organization name - -- We also include years because we'll want those later for yearly counts - -- and cv/robotics/nlp so we can filter on these WITH ai_papers AS ( SELECT - cset_id AS merged_id, + merged_id, cv_filtered, nlp_filtered, 
robotics_filtered From 5fa80da27c2bb9aec15534cd68e831f9b715a51c Mon Sep 17 00:00:00 2001 From: Rebecca Date: Fri, 1 Dec 2023 15:17:22 -0500 Subject: [PATCH 14/17] Update for tasks and methods --- .../sql/paper_visualization_data_with_methods.sql | 8 +++----- .../sql/paper_visualization_data_with_tasks.sql | 8 +++----- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/company_linkage/sql/paper_visualization_data_with_methods.sql b/company_linkage/sql/paper_visualization_data_with_methods.sql index 853177cd..991a122e 100644 --- a/company_linkage/sql/paper_visualization_data_with_methods.sql +++ b/company_linkage/sql/paper_visualization_data_with_methods.sql @@ -1,12 +1,10 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.paper_visualization_data AS WITH articles_with_ai_methods AS ( SELECT DISTINCT merged_id, referent, FROM - `gcp-cset-projects.tasks_and_methods.method_referents` + tasks_and_methods.method_referents CROSS JOIN UNNEST(referents) AS referent), company_articles_with_methods AS ( @@ -44,10 +42,10 @@ WITH GROUP BY CSET_id) SELECT - paper_visualization_data.*, + paper_visualization_data_with_tasks.*, methods FROM - ai_companies_visualization.paper_visualization_data + ai_companies_visualization.paper_visualization_data_with_tasks LEFT JOIN aggregated_fields USING diff --git a/company_linkage/sql/paper_visualization_data_with_tasks.sql b/company_linkage/sql/paper_visualization_data_with_tasks.sql index afaf256c..b2c6b157 100644 --- a/company_linkage/sql/paper_visualization_data_with_tasks.sql +++ b/company_linkage/sql/paper_visualization_data_with_tasks.sql @@ -1,12 +1,10 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.paper_visualization_data AS WITH articles_with_ai_tasks AS ( SELECT DISTINCT merged_id, referent, FROM - `gcp-cset-projects.tasks_and_methods.task_referents` + tasks_and_methods.task_referents CROSS JOIN UNNEST(referents) AS referent), company_articles_with_tasks AS ( @@ -44,10 +42,10 @@ WITH GROUP BY CSET_id) 
SELECT - paper_visualization_data.*, + paper_visualization_data_with_company_references.*, tasks FROM - ai_companies_visualization.paper_visualization_data + ai_companies_visualization.paper_visualization_data_with_company_references LEFT JOIN aggregated_fields USING From a9d54cb0f8acdce54c745da200db1b407bc8d01a Mon Sep 17 00:00:00 2001 From: Rebecca Date: Mon, 4 Dec 2023 17:57:33 -0500 Subject: [PATCH 15/17] Finalize pipeline; fix idempotency; add schemas --- company_linkage/parat_data_dag.py | 63 +- .../schemas/paper_visualization_data.json | 128 +++ .../schemas/patent_visualization_data.json | 950 ++++++++++++++++++ .../schemas/visualization_data.json | 376 +++++++ .../schemas/workforce_visualization_data.json | 20 + .../sql/initial_paper_visualization_data.sql | 16 +- .../initial_workforce_visualization_data.sql | 17 +- .../sql/paper_visualization_data.sql | 4 +- .../paper_visualization_data_with_methods.sql | 4 +- .../paper_visualization_data_with_tasks.sql | 4 +- ...kforce_visualization_data_with_ai_jobs.sql | 23 +- 11 files changed, 1581 insertions(+), 24 deletions(-) create mode 100644 company_linkage/schemas/paper_visualization_data.json create mode 100644 company_linkage/schemas/patent_visualization_data.json create mode 100644 company_linkage/schemas/visualization_data.json create mode 100644 company_linkage/schemas/workforce_visualization_data.json diff --git a/company_linkage/parat_data_dag.py b/company_linkage/parat_data_dag.py index 0bde5f58..c3d3d3e5 100644 --- a/company_linkage/parat_data_dag.py +++ b/company_linkage/parat_data_dag.py @@ -4,10 +4,11 @@ from airflow import DAG from airflow.operators.python import PythonOperator from airflow.operators.trigger_dagrun import TriggerDagRunOperator -from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator +from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator, BigQueryCheckOperator from airflow.providers.google.cloud.operators.cloud_sql 
import ( CloudSQLImportInstanceOperator, ) +from airflow.providers.google.cloud.transfers.bigquery_to_bigquery import BigQueryToBigQueryOperator from airflow.providers.google.cloud.operators.kubernetes_engine import GKEStartPodOperator from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator from airflow.operators.dummy import DummyOperator @@ -19,10 +20,12 @@ DATA_BUCKET, PROJECT_ID, GCP_ZONE, + DAGS_DIR, get_default_args, get_post_success, ) from dataloader.scripts.populate_documentation import update_table_descriptions + from parat_scripts.aggregate_organizations import aggregate_organizations bucket = DATA_BUCKET @@ -30,6 +33,7 @@ intermediate_dataset = "high_resolution_entities" production_dataset = "ai_companies_visualization" staging_dataset = f"staging_{production_dataset}" +backups_dataset = f"{production_dataset}_backups" sql_dir = "sql/parat" schema_dir = "parat/schemas" tmp_dir = f"{production_dataset}/tmp" @@ -49,7 +53,8 @@ "staging_dataset": staging_dataset, "production_dataset": production_dataset, "intermediate_dataset": intermediate_dataset, - "initial_dataset": initial_dataset + "initial_dataset": initial_dataset, + "backups_dataset": backups_dataset, }, ) with dag: @@ -293,6 +298,51 @@ curr = next_tab curr >> wait_for_visualization_tables + checks = [] + for query in os.listdir(f"{DAGS_DIR}/{sql_dir}"): + if not query.startswith("check_"): + continue + checks.append(BigQueryCheckOperator( + task_id=query.replace(".sql", ""), + sql=f"{sql_dir}/{query}", + use_legacy_sql=False + )) + + wait_for_checks = DummyOperator(task_id="wait_for_checks") + + wait_for_copy = DummyOperator(task_id="wait_for_copy") + + curr_date = datetime.now().strftime('%Y%m%d') + prod_tables = ["visualization_data", "paper_visualization_data", + "patent_visualization_data", "workforce_visualization_data"] + for table in prod_tables: + prod_table_name = f"{production_dataset}.{table}" + copy_to_production = BigQueryToBigQueryOperator( + 
task_id="copy_" + table + "_to_production", + source_project_dataset_tables=[staging_dataset + "." + table], + destination_project_dataset_table=prod_table_name, + create_disposition="CREATE_IF_NEEDED", + write_disposition="WRITE_TRUNCATE" + ) + pop_descriptions = PythonOperator( + task_id="populate_column_documentation_for_" + table, + op_kwargs={ + "input_schema": f"{os.environ.get('DAGS_FOLDER')}/schemas/parat/{table}.json", + "table_name": prod_table_name + }, + python_callable=update_table_descriptions + ) + table_backup = BigQueryToBigQueryOperator( + task_id=f"back_up_{table}", + source_project_dataset_tables=[f"{staging_dataset}.{table}"], + destination_project_dataset_table=f"{backups_dataset}.{table}_{curr_date}", + create_disposition="CREATE_IF_NEEDED", + write_disposition="WRITE_TRUNCATE" + ) + wait_for_checks >> copy_to_production >> pop_descriptions >> table_backup >> wait_for_copy + + # post success to slack + msg_success = get_post_success("PARAT tables updated!", dag) ( clear_tmp_dir @@ -312,4 +362,13 @@ >> load_all_papers >> start_visualization_tables ) + ( + wait_for_visualization_tables + >> checks + >> wait_for_checks + ) + ( + wait_for_copy + >> msg_success + ) diff --git a/company_linkage/schemas/paper_visualization_data.json b/company_linkage/schemas/paper_visualization_data.json new file mode 100644 index 00000000..cc9e5aab --- /dev/null +++ b/company_linkage/schemas/paper_visualization_data.json @@ -0,0 +1,128 @@ +[ + { + "mode": "NULLABLE", + "name": "CSET_id", + "type": "INTEGER", + "description": "CSET id of PARAT company." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year cited." + }, + { + "mode": "NULLABLE", + "name": "citation_count", + "type": "INTEGER", + "description": "Count of publications in that year that cite AI papers written by the company." 
+ } + ], + "mode": "REPEATED", + "name": "citation_count_by_year", + "type": "RECORD", + "description": "Citations of AI papers by the company by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "field_name", + "type": "STRING", + "description": "Field of study name." + }, + { + "mode": "NULLABLE", + "name": "field_count", + "type": "INTEGER", + "description": "Count of AI papers by the company where field of study is in their top fields." + } + ], + "mode": "REPEATED", + "name": "fields", + "type": "RECORD", + "description": "Fields of study counts (using MAG-style fields of study for AI-relevant fields)." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "cluster_id", + "type": "INTEGER", + "description": "Map of Science research cluster ID." + }, + { + "mode": "NULLABLE", + "name": "cluster_count", + "type": "INTEGER", + "description": "Count of how many AI publications from the company appear in that cluster." + } + ], + "mode": "REPEATED", + "name": "clusters", + "type": "RECORD", + "description": "Counts of top publications in research clusters." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "ref_CSET_id", + "type": "INTEGER", + "description": "CSET id of referenced PARAT company." + }, + { + "mode": "NULLABLE", + "name": "referenced_count", + "type": "INTEGER", + "description": "Count of how many AI publications by that company the primary PARAT company has referenced in their papers." + } + ], + "mode": "REPEATED", + "name": "company_references", + "type": "RECORD", + "description": "Counts of publication references to the publications of other companies in the PARAT dataset." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "referent", + "type": "STRING", + "description": "The task name referent." + }, + { + "mode": "NULLABLE", + "name": "task_count", + "type": "INTEGER", + "description": "Count of how many AI publications by the company contain this task." 
+ } + ], + "mode": "REPEATED", + "name": "tasks", + "type": "RECORD", + "description": "AI task information." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "referent", + "type": "STRING", + "description": "The method name referent." + }, + { + "mode": "NULLABLE", + "name": "method_count", + "type": "INTEGER", + "description": "Count of how many AI publications by the company contain this method." + } + ], + "mode": "REPEATED", + "name": "methods", + "type": "RECORD", + "description": "AI method information." + } +] \ No newline at end of file diff --git a/company_linkage/schemas/patent_visualization_data.json b/company_linkage/schemas/patent_visualization_data.json new file mode 100644 index 00000000..42496163 --- /dev/null +++ b/company_linkage/schemas/patent_visualization_data.json @@ -0,0 +1,950 @@ +[ + { + "mode": "NULLABLE", + "name": "CSET_id", + "type": "INTEGER", + "description": "CSET id of PARAT company." + }, + { + "mode": "NULLABLE", + "name": "name", + "type": "STRING", + "description": "Name of PARAT company." + }, + { + "mode": "NULLABLE", + "name": "ai_patents", + "type": "INTEGER", + "description": "Total AI patent families." + }, + { + "mode": "NULLABLE", + "name": "Physical_Sciences_and_Engineering_pats", + "type": "INTEGER", + "description": "AI patent families in physical science and engineering application category." + }, + { + "mode": "NULLABLE", + "name": "Life_Sciences_pats", + "type": "INTEGER", + "description": "AI patent families in life sciences application category." + }, + { + "mode": "NULLABLE", + "name": "Security__eg_cybersecurity_pats", + "type": "INTEGER", + "description": "AI patent families in security (e.g. cybersecurity) application category." + }, + { + "mode": "NULLABLE", + "name": "Transportation_pats", + "type": "INTEGER", + "description": "AI patent families in transportation application category." 
+ }, + { + "mode": "NULLABLE", + "name": "Industrial_and_Manufacturing_pats", + "type": "INTEGER", + "description": "AI patent families in industrial and manufacturing application category." + }, + { + "mode": "NULLABLE", + "name": "Education_pats", + "type": "INTEGER", + "description": "AI patent families in education application category." + }, + { + "mode": "NULLABLE", + "name": "Document_Mgt_and_Publishing_pats", + "type": "INTEGER", + "description": "AI patent families in document management and publishing application category." + }, + { + "mode": "NULLABLE", + "name": "Military_pats", + "type": "INTEGER", + "description": "AI patent families in military application category." + }, + { + "mode": "NULLABLE", + "name": "Agricultural_pats", + "type": "INTEGER", + "description": "AI patent families in agricultural application category." + }, + { + "mode": "NULLABLE", + "name": "Computing_in_Government_pats", + "type": "INTEGER", + "description": "AI patent families in computing in government application category." + }, + { + "mode": "NULLABLE", + "name": "Personal_Devices_and_Computing_pats", + "type": "INTEGER", + "description": "AI patent families in personal devices and computing application category." + }, + { + "mode": "NULLABLE", + "name": "Banking_and_Finance_pats", + "type": "INTEGER", + "description": "AI patent families in banking and finance application category." + }, + { + "mode": "NULLABLE", + "name": "Telecommunications_pats", + "type": "INTEGER", + "description": "AI patent families in telecommunications application category." + }, + { + "mode": "NULLABLE", + "name": "Networks__eg_social_IOT_etc_pats", + "type": "INTEGER", + "description": "AI patent families in networks (e.g. social, IOT, etc.) application category." + }, + { + "mode": "NULLABLE", + "name": "Business_pats", + "type": "INTEGER", + "description": "AI patent families in business application category." 
+ }, + { + "mode": "NULLABLE", + "name": "Energy_Management_pats", + "type": "INTEGER", + "description": "AI patent families in energy management application category." + }, + { + "mode": "NULLABLE", + "name": "Entertainment_pats", + "type": "INTEGER", + "description": "AI patent families in entertainment application category." + }, + { + "mode": "NULLABLE", + "name": "Nanotechnology_pats", + "type": "INTEGER", + "description": "AI patent families in nanotechnology application category." + }, + { + "mode": "NULLABLE", + "name": "Semiconductors_pats", + "type": "INTEGER", + "description": "AI patent families in semiconductors application category." + }, + { + "mode": "NULLABLE", + "name": "Language_Processing_pats", + "type": "INTEGER", + "description": "AI patent families in language processing functional application category." + }, + { + "mode": "NULLABLE", + "name": "Speech_Processing_pats", + "type": "INTEGER", + "description": "AI patent families in speech processing functional application category." + }, + { + "mode": "NULLABLE", + "name": "Knowledge_Representation_pats", + "type": "INTEGER", + "description": "AI patent families in knowledge representation functional application category." + }, + { + "mode": "NULLABLE", + "name": "Planning_and_Scheduling_pats", + "type": "INTEGER", + "description": "AI patent families in planning and scheduling functional application category." + }, + { + "mode": "NULLABLE", + "name": "Control_pats", + "type": "INTEGER", + "description": "AI patent families in control functional application category." + }, + { + "mode": "NULLABLE", + "name": "Distributed_AI_pats", + "type": "INTEGER", + "description": "AI patent families in distributed AI functional application category." + }, + { + "mode": "NULLABLE", + "name": "Robotics_pats", + "type": "INTEGER", + "description": "AI patent families in robotics functional application category." 
+ }, + { + "mode": "NULLABLE", + "name": "Computer_Vision_pats", + "type": "INTEGER", + "description": "AI patent families in computer vision functional application category." + }, + { + "mode": "NULLABLE", + "name": "Analytics_and_Algorithms_pats", + "type": "INTEGER", + "description": "AI patent families in analytics and algorithms functional application category." + }, + { + "mode": "NULLABLE", + "name": "Measuring_and_Testing_pats", + "type": "INTEGER", + "description": "AI patent families in measuring and testing functional application category." + }, + { + "mode": "NULLABLE", + "name": "Logic_Programming_pats", + "type": "INTEGER", + "description": "AI patent families in logic programming AI techniques category." + }, + { + "mode": "NULLABLE", + "name": "Fuzzy_Logic_pats", + "type": "INTEGER", + "description": "AI patent families in fuzzy logic AI techniques category." + }, + { + "mode": "NULLABLE", + "name": "Probabilistic_Reasoning_pats", + "type": "INTEGER", + "description": "AI patent families in probabilistic reasoning AI techniques category." + }, + { + "mode": "NULLABLE", + "name": "Ontology_Engineering_pats", + "type": "INTEGER", + "description": "AI patent families in ontology engineering AI techniques category." + }, + { + "mode": "NULLABLE", + "name": "Machine_Learning_pats", + "type": "INTEGER", + "description": "AI patent families in machine learning AI techniques category." + }, + { + "mode": "NULLABLE", + "name": "Search_Methods_pats", + "type": "INTEGER", + "description": "AI patent families in search methods AI techniques category." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "ai_patents", + "type": "INTEGER", + "description": "AI patent families count for that year." 
+ } + ], + "mode": "REPEATED", + "name": "ai_patents_by_year", + "type": "RECORD", + "description": "Count of total AI patent families by priority year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Physical_Sciences_and_Engineering_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the physical science and engineering application category for that year." + } + ], + "mode": "REPEATED", + "name": "Physical_Sciences_and_Engineering_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the physical science and engineering application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Life_Sciences_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the life sciences application category for that year." + } + ], + "mode": "REPEATED", + "name": "Life_Sciences_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the life sciences application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Security__eg_cybersecurity_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the security (e.g. cybersecurity) application category for that year." + } + ], + "mode": "REPEATED", + "name": "Security__eg_cybersecurity_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the security (e.g. cybersecurity) application category by year." 
+ }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Transportation_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the transportation application category for that year." + } + ], + "mode": "REPEATED", + "name": "Transportation_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the transportation application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Industrial_and_Manufacturing_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the industrial and manufacturing application category for that year." + } + ], + "mode": "REPEATED", + "name": "Industrial_and_Manufacturing_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the industrial and manufacturing application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Education_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the education application category for that year." + } + ], + "mode": "REPEATED", + "name": "Education_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the education application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Document_Mgt_and_Publishing_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the document management and publishing application category for that year." 
+ } + ], + "mode": "REPEATED", + "name": "Document_Mgt_and_Publishing_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the document management and publishing application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Military_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the military application category for that year." + } + ], + "mode": "REPEATED", + "name": "Military_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the military application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Agricultural_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the agricultural application category for that year." + } + ], + "mode": "REPEATED", + "name": "Agricultural_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the agricultural application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Computing_in_Government_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the computing in government application category for that year." + } + ], + "mode": "REPEATED", + "name": "Computing_in_Government_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the computing in government application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." 
+ }, + { + "mode": "NULLABLE", + "name": "Personal_Devices_and_Computing_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the personal devices and computing application category for that year." + } + ], + "mode": "REPEATED", + "name": "Personal_Devices_and_Computing_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the personal devices and computing application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Banking_and_Finance_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the banking and finance application category for that year." + } + ], + "mode": "REPEATED", + "name": "Banking_and_Finance_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the banking and finance application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Telecommunications_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the telecommunications application category for that year." + } + ], + "mode": "REPEATED", + "name": "Telecommunications_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the telecommunications application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Networks__eg_social_IOT_etc_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the networks (e.g. social, IOT, etc.) application category for that year." 
+ } + ], + "mode": "REPEATED", + "name": "Networks__eg_social_IOT_etc_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the networks (e.g. social, IOT, etc.) application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Business_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the business application category for that year." + } + ], + "mode": "REPEATED", + "name": "Business_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the business application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Energy_Management_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the energy management application category for that year." + } + ], + "mode": "REPEATED", + "name": "Energy_Management_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the energy management application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Entertainment_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the entertainment application category for that year." + } + ], + "mode": "REPEATED", + "name": "Entertainment_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the entertainment application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." 
+ }, + { + "mode": "NULLABLE", + "name": "Nanotechnology_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the nanotechnology application category for that year." + } + ], + "mode": "REPEATED", + "name": "Nanotechnology_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the nanotechnology application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Semiconductors_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the semiconductors application category for that year." + } + ], + "mode": "REPEATED", + "name": "Semiconductors_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the semiconductors application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Language_Processing_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the language processing functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Language_Processing_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the language processing functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Speech_Processing_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the speech processing functional application category for that year." 
+ } + ], + "mode": "REPEATED", + "name": "Speech_Processing_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the speech processing functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Knowledge_Representation_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the knowledge representation functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Knowledge_Representation_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the knowledge representation functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Planning_and_Scheduling_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the planning and scheduling functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Planning_and_Scheduling_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the planning and scheduling functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Control_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the control functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Control_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the control functional application category by year." 
+ }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Distributed_AI_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the distributed AI functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Distributed_AI_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the distributed AI functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Robotics_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the robotics functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Robotics_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the robotics functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Computer_Vision_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the computer vision functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Computer_Vision_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the computer vision functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Analytics_and_Algorithms_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the analytics and algorithms functional application category for that year." 
+ } + ], + "mode": "REPEATED", + "name": "Analytics_and_Algorithms_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the analytics and algorithms functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Measuring_and_Testing_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the measuring and testing functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Measuring_and_Testing_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the measuring and testing functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Logic_Programming_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the logic programming AI techniques category for that year." + } + ], + "mode": "REPEATED", + "name": "Logic_Programming_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the logic programming AI techniques category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Fuzzy_Logic_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the fuzzy logic AI techniques category for that year." + } + ], + "mode": "REPEATED", + "name": "Fuzzy_Logic_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the fuzzy logic AI techniques category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family."
+ }, + { + "mode": "NULLABLE", + "name": "Probabilistic_Reasoning_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the probabilistic reasoning AI techniques category for that year." + } + ], + "mode": "REPEATED", + "name": "Probabilistic_Reasoning_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the probabilistic reasoning AI techniques category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Ontology_Engineering_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the ontology engineering AI techniques category for that year." + } + ], + "mode": "REPEATED", + "name": "Ontology_Engineering_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the ontology engineering AI techniques category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Machine_Learning_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the machine learning AI techniques category for that year." + } + ], + "mode": "REPEATED", + "name": "Machine_Learning_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the machine learning AI techniques category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Search_Methods_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the search methods AI techniques category for that year." 
+ } + ], + "mode": "REPEATED", + "name": "Search_Methods_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the search methods AI techniques category by year." + } +] \ No newline at end of file diff --git a/company_linkage/schemas/visualization_data.json b/company_linkage/schemas/visualization_data.json new file mode 100644 index 00000000..b34fc089 --- /dev/null +++ b/company_linkage/schemas/visualization_data.json @@ -0,0 +1,376 @@ +[ + { + "mode": "NULLABLE", + "name": "CSET_id", + "type": "INTEGER", + "description": "CSET id of PARAT company." + }, + { + "mode": "NULLABLE", + "name": "name", + "type": "STRING", + "description": "Name of PARAT company." + }, + { + "mode": "NULLABLE", + "name": "country", + "type": "STRING", + "description": "Country of PARAT company. If company is located in multiple countries, country of headquarters." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "alias_language", + "type": "STRING", + "description": "Language alias is written in." + }, + { + "mode": "NULLABLE", + "name": "alias", + "type": "STRING", + "description": "Alias of company." + } + ], + "mode": "REPEATED", + "name": "aliases", + "type": "RECORD", + "description": "List of company aliases." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "parent_acquisition", + "type": "BOOLEAN", + "description": "Boolean indicating whether the company was acquired by its parent company." + }, + { + "mode": "NULLABLE", + "name": "parent_name", + "type": "STRING", + "description": "Name of parent company." + }, + { + "mode": "NULLABLE", + "name": "parent_id", + "type": "INTEGER", + "description": "CSET id of parent company." + } + ], + "mode": "REPEATED", + "name": "parent", + "type": "RECORD", + "description": "List of parent companies." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "child_name", + "type": "STRING", + "description": "Name of child company." 
+ }, + { + "mode": "NULLABLE", + "name": "child_id", + "type": "INTEGER", + "description": "CSET id of child company." + } + ], + "mode": "REPEATED", + "name": "children", + "type": "RECORD", + "description": "List of child companies." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "child_name", + "type": "STRING", + "description": "Name of child company." + }, + { + "mode": "NULLABLE", + "name": "child_id", + "type": "INTEGER", + "description": "CSET id of child companies." + } + ], + "mode": "REPEATED", + "name": "non_agg_children", + "type": "RECORD", + "description": "Name of child company whose data has not been aggregated into the records of the parent company." + }, + { + "mode": "REPEATED", + "name": "permid", + "type": "INTEGER", + "description": "Refinitiv Permid." + }, + { + "mode": "NULLABLE", + "name": "website", + "type": "STRING", + "description": "Company website." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "exchange", + "type": "STRING", + "description": "Exchange on which the company is listed." + }, + { + "mode": "NULLABLE", + "name": "ticker", + "type": "STRING", + "description": "Company ticker." + } + ], + "mode": "REPEATED", + "name": "market", + "type": "RECORD", + "description": "Company exchange and ticker data." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "crunchbase_uuid", + "type": "STRING", + "description": "UUID in Crunchbase." + }, + { + "mode": "NULLABLE", + "name": "crunchbase_url", + "type": "STRING", + "description": "URL on Crunchbase website." + } + ], + "mode": "NULLABLE", + "name": "crunchbase", + "type": "RECORD", + "description": "Crunchbase unique identifier." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "crunchbase_uuid", + "type": "STRING", + "description": "UUID in Crunchbase." + }, + { + "mode": "NULLABLE", + "name": "crunchbase_url", + "type": "STRING", + "description": "URL on Crunchbase website." 
+ } + ], + "mode": "REPEATED", + "name": "child_crunchbase", + "type": "RECORD", + "description": "Crunchbase unique identifiers for any child companies of the aggregated company." + }, + { + "mode": "REPEATED", + "name": "ror_id", + "type": "STRING", + "description": "ROR id for the company." + }, + { + "mode": "REPEATED", + "name": "linkedin", + "type": "STRING", + "description": "LinkedIn website for the company." + }, + { + "mode": "NULLABLE", + "name": "in_sandp_500", + "type": "BOOLEAN", + "description": "Indicator of whether the company was in the S&P 500 at some point during 2020." + }, + { + "mode": "NULLABLE", + "name": "in_fortune_global_500", + "type": "BOOLEAN", + "description": "Indicator of whether the company was on the 2021 Fortune Global 500 list." + }, + { + "mode": "NULLABLE", + "name": "ai_pubs", + "type": "INTEGER", + "description": "Count of total AI publications by the company." + }, + { + "mode": "NULLABLE", + "name": "cv_pubs", + "type": "INTEGER", + "description": "Count of total computer vision publications by the company." + }, + { + "mode": "NULLABLE", + "name": "nlp_pubs", + "type": "INTEGER", + "description": "Count of total natural language processing publications by the company." + }, + { + "mode": "NULLABLE", + "name": "robotics_pubs", + "type": "INTEGER", + "description": "Count of total robotics publications by the company." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year published." + }, + { + "mode": "NULLABLE", + "name": "ai_pubs", + "type": "INTEGER", + "description": "Count of total AI publications by the company in that year." + } + ], + "mode": "REPEATED", + "name": "ai_pubs_by_year", + "type": "RECORD", + "description": "Counts of AI publications by the company by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year published." 
+ }, + { + "mode": "NULLABLE", + "name": "cv_pubs", + "type": "INTEGER", + "description": "Count of total computer vision publications by the company in that year." + } + ], + "mode": "REPEATED", + "name": "cv_pubs_by_year", + "type": "RECORD", + "description": "Counts of computer vision publications by the company by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year published." + }, + { + "mode": "NULLABLE", + "name": "nlp_pubs", + "type": "INTEGER", + "description": "Count of total natural language processing publications by the company in that year." + } + ], + "mode": "REPEATED", + "name": "nlp_pubs_by_year", + "type": "RECORD", + "description": "Counts of natural language processing publications by the company by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year published." + }, + { + "mode": "NULLABLE", + "name": "robotics_pubs", + "type": "INTEGER", + "description": "Count of total robotics publications by the company in that year." + } + ], + "mode": "REPEATED", + "name": "robotics_pubs_by_year", + "type": "RECORD", + "description": "Counts of robotics publications by the company by year." + }, + { + "mode": "NULLABLE", + "name": "ai_pubs_in_top_conferences", + "type": "INTEGER", + "description": "Counts of total AI publications by the company that were published in top AI conferences." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year published." + }, + { + "mode": "NULLABLE", + "name": "ai_pubs_in_top_conferences", + "type": "INTEGER", + "description": "Count of total AI publications by the company that were published in top AI conferences in that year." + } + ], + "mode": "REPEATED", + "name": "ai_pubs_in_top_conferences_by_year", + "type": "RECORD", + "description": "Counts of AI publications in top conferences by the company by year." 
+ }, + { + "mode": "NULLABLE", + "name": "all_pubs", + "type": "INTEGER", + "description": "Count of total publications by the company." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year published." + }, + { + "mode": "NULLABLE", + "name": "all_pubs", + "type": "INTEGER", + "description": "Counts of total publications by the company in that year." + } + ], + "mode": "REPEATED", + "name": "all_pubs_by_year", + "type": "RECORD", + "description": "Counts of publications by the company by year." + }, + { + "mode": "NULLABLE", + "name": "short_description", + "type": "STRING", + "description": "Short description of the company, as drawn from Crunchbase's free interface." + }, + { + "mode": "NULLABLE", + "name": "logo_url", + "type": "STRING", + "description": "URL linking to a picture of the logo of the company, as drawn from Crunchbase's free interface." + }, + { + "mode": "NULLABLE", + "name": "stage", + "type": "STRING", + "description": "Maturity stage of a company." + } +] \ No newline at end of file diff --git a/company_linkage/schemas/workforce_visualization_data.json b/company_linkage/schemas/workforce_visualization_data.json new file mode 100644 index 00000000..f90a8022 --- /dev/null +++ b/company_linkage/schemas/workforce_visualization_data.json @@ -0,0 +1,20 @@ +[ + { + "mode": "NULLABLE", + "name": "cset_id", + "type": "INTEGER", + "description": "CSET id of PARAT company." + }, + { + "mode": "NULLABLE", + "name": "tt1_jobs", + "type": "INTEGER", + "description": "AI jobs as defined under CSET's technical track 1 definition." + }, + { + "mode": "NULLABLE", + "name": "ai_jobs", + "type": "INTEGER", + "description": "AI jobs as defined under a narrower definition within CSET's technical track 1 definition, focused specifically on research and implementation jobs within technical track 1." 
+ } +] \ No newline at end of file diff --git a/company_linkage/sql/initial_paper_visualization_data.sql b/company_linkage/sql/initial_paper_visualization_data.sql index 57d83813..09c085b2 100644 --- a/company_linkage/sql/initial_paper_visualization_data.sql +++ b/company_linkage/sql/initial_paper_visualization_data.sql @@ -33,8 +33,9 @@ WITH add_year GROUP BY CSET_id, - year) -SELECT + year), +all_cited as +(SELECT CSET_id, ARRAY_AGG(STRUCT(year, citation_count) @@ -43,6 +44,15 @@ SELECT FROM by_year GROUP BY - CSET_id + CSET_id) +SELECT + CSET_id, + citation_count_by_year +FROM + high_resolution_entities.aggregated_organizations +LEFT JOIN + all_cited +USING + (CSET_id) ORDER BY CSET_id \ No newline at end of file diff --git a/company_linkage/sql/initial_workforce_visualization_data.sql b/company_linkage/sql/initial_workforce_visualization_data.sql index 1c0f9be9..f6cdbec3 100644 --- a/company_linkage/sql/initial_workforce_visualization_data.sql +++ b/company_linkage/sql/initial_workforce_visualization_data.sql @@ -7,8 +7,9 @@ WITH FROM high_resolution_entities.aggregated_organizations CROSS JOIN - UNNEST (linkedin) AS linkedins) -SELECT + UNNEST (linkedin) AS linkedins), +job_info as +(SELECT DISTINCT cset_id, COUNT(DISTINCT user_id) AS tt1_jobs FROM @@ -41,6 +42,16 @@ WHERE OR ((degree = "Doctor") AND REGEXP_CONTAINS(field_raw, r'(?i)(computer\s+science|computer\s+engineering|electrical\s+engineering)'))) GROUP BY - cset_id + cset_id) +SELECT + DISTINCT + cset_id, + COALESCE(tt1_jobs, 0) as tt1_jobs +FROM + high_resolution_entities.aggregated_organizations +LEFT JOIN + job_info +USING + (cset_id) ORDER BY cset_id \ No newline at end of file diff --git a/company_linkage/sql/paper_visualization_data.sql b/company_linkage/sql/paper_visualization_data.sql index 122b58a8..3f24c27a 100644 --- a/company_linkage/sql/paper_visualization_data.sql +++ b/company_linkage/sql/paper_visualization_data.sql @@ -4,11 +4,11 @@ WITH SELECT CSET_id FROM - 
staging_ai_companies_visualization.visualization_data_omit_by_year + staging_ai_companies_visualization.visualization_data_omit_by_rule RIGHT JOIN staging_ai_companies_visualization.paper_visualization_data_with_methods USING (cset_id) - WHERE visualization_data_omit_by_year.cset_id IS NULL) + WHERE visualization_data_omit_by_rule.cset_id IS NULL) SELECT * FROM diff --git a/company_linkage/sql/paper_visualization_data_with_methods.sql b/company_linkage/sql/paper_visualization_data_with_methods.sql index 991a122e..e561d138 100644 --- a/company_linkage/sql/paper_visualization_data_with_methods.sql +++ b/company_linkage/sql/paper_visualization_data_with_methods.sql @@ -13,7 +13,7 @@ WITH merged_id, referent FROM - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers LEFT JOIN articles_with_ai_methods USING @@ -45,7 +45,7 @@ SELECT paper_visualization_data_with_tasks.*, methods FROM - ai_companies_visualization.paper_visualization_data_with_tasks + staging_ai_companies_visualization.paper_visualization_data_with_tasks LEFT JOIN aggregated_fields USING diff --git a/company_linkage/sql/paper_visualization_data_with_tasks.sql b/company_linkage/sql/paper_visualization_data_with_tasks.sql index b2c6b157..eed2f588 100644 --- a/company_linkage/sql/paper_visualization_data_with_tasks.sql +++ b/company_linkage/sql/paper_visualization_data_with_tasks.sql @@ -13,7 +13,7 @@ WITH merged_id, referent FROM - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers LEFT JOIN articles_with_ai_tasks USING @@ -45,7 +45,7 @@ SELECT paper_visualization_data_with_company_references.*, tasks FROM - ai_companies_visualization.paper_visualization_data_with_company_references + staging_ai_companies_visualization.paper_visualization_data_with_company_references LEFT JOIN aggregated_fields USING diff --git a/company_linkage/sql/workforce_visualization_data_with_ai_jobs.sql 
b/company_linkage/sql/workforce_visualization_data_with_ai_jobs.sql index 3f0c174c..a6981174 100644 --- a/company_linkage/sql/workforce_visualization_data_with_ai_jobs.sql +++ b/company_linkage/sql/workforce_visualization_data_with_ai_jobs.sql @@ -1,10 +1,9 @@ -create or replace table ai_companies_visualization.workforce_visualization_data as WITH clean_linkedins AS ( SELECT DISTINCT cset_id, name, - REPLACE(linkedins, "https://www.", "http://") AS linkedin + REPLACE(REPLACE(linkedins, "https://www.", ""), "http://www.", "") AS linkedin FROM high_resolution_entities.aggregated_organizations CROSS JOIN @@ -12,15 +11,15 @@ WITH new_ai_jobs AS ( SELECT DISTINCT cset_id, - COUNT(DISTINCT user_id) AS ai_jobs + COUNT(DISTINCT individual_position.user_id) AS ai_jobs FROM clean_linkedins INNER JOIN - `gcp-cset-projects.gcp_cset_revelio.position` position + revelio.individual_position ON - linkedin = company_li_url + linkedin = company_linkedin_url INNER JOIN - gcp_cset_revelio.role_lookup + revelio.role_lookup USING (mapped_role) INNER JOIN @@ -28,12 +27,16 @@ WITH ON (k1000 = role_k1000) LEFT JOIN - gcp_cset_revelio.education + revelio.individual_education USING (user_id) + LEFT JOIN + revelio.individual_position_descriptions + USING + (position_id) WHERE - (position.enddate IS NULL - OR position.enddate > CURRENT_DATE()) + (individual_position.enddate IS NULL + OR individual_position.enddate > CURRENT_DATE()) AND (ba_req IS FALSE OR ((degree = "Bachelor" OR degree = "Master" @@ -52,7 +55,7 @@ SELECT tt1_jobs, COALESCE(ai_jobs, 0) as ai_jobs FROM - ai_companies_visualization.workforce_visualization_data + staging_ai_companies_visualization.initial_workforce_visualization_data LEFT JOIN new_ai_jobs USING From 38cedbb6664a4a84e73cb391ce1a2b355f8fb362 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Fri, 12 Jan 2024 10:20:54 -0500 Subject: [PATCH 16/17] Clean things up, remove imports, edit node pools, and fix comments --- company_linkage/parat_data_dag.py | 17 +++++------------ 
.../sql/paper_visualization_data.sql | 5 +++++ .../sql/patent_visualization_data.sql | 5 +++++ .../sql/workforce_visualization_data.sql | 5 +++++ 4 files changed, 20 insertions(+), 12 deletions(-) diff --git a/company_linkage/parat_data_dag.py b/company_linkage/parat_data_dag.py index c3d3d3e5..4cfcd805 100644 --- a/company_linkage/parat_data_dag.py +++ b/company_linkage/parat_data_dag.py @@ -3,19 +3,12 @@ from airflow import DAG from airflow.operators.python import PythonOperator -from airflow.operators.trigger_dagrun import TriggerDagRunOperator from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator, BigQueryCheckOperator -from airflow.providers.google.cloud.operators.cloud_sql import ( - CloudSQLImportInstanceOperator, -) from airflow.providers.google.cloud.transfers.bigquery_to_bigquery import BigQueryToBigQueryOperator from airflow.providers.google.cloud.operators.kubernetes_engine import GKEStartPodOperator from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator from airflow.operators.dummy import DummyOperator from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator -from airflow.providers.google.cloud.transfers.bigquery_to_gcs import ( - BigQueryToGCSOperator, -) from dataloader.airflow_utils.defaults import ( DATA_BUCKET, PROJECT_ID, @@ -72,7 +65,6 @@ join_tables = [] for table in ["alias", "grid", "ids", "linkedin", "market", "organizations", "parent", "permid"]: - # Grab all the data and write it to unseen_en_corpus join_table = BigQueryInsertJobOperator( task_id=f"join_{table}", configuration={ @@ -105,12 +97,13 @@ curr = start_initial_tables for line in open(seq_path_prefix + initial_query_sequence).readlines(): dataset, table = line.split(",") - table_name = f"{dataset}.{table.strip()}" + table = table.strip() + table_name = f"{dataset}.{table}" next_tab = BigQueryInsertJobOperator( task_id=f"create_{table_name}", configuration={ "query": { - "query": "{% 
include '" + f"{sql_dir}/{table.strip()}.sql" + "' %}", + "query": "{% include '" + f"{sql_dir}/{table}.sql" + "' %}", "useLegacySql": False, "destinationTable": { "projectId": PROJECT_ID, @@ -155,7 +148,7 @@ task_id="run_get_ai_counts", project_id=PROJECT_ID, location=GCP_ZONE, - cluster_name="us-east1-production2023-cc1-01d75926-gke", + cluster_name="cc2-task-pool", name="run_get_ai_counts", cmds=["/bin/bash"], arguments=["-c", (f"echo 'getting AI counts!' ; rm -r ai || true ; " @@ -213,7 +206,7 @@ task_id=f"run_get_{paper_type}_counts", project_id=PROJECT_ID, location=GCP_ZONE, - cluster_name="us-east1-production2023-cc1-01d75926-gke", + cluster_name="cc2-task-pool", name=f"run_get_{paper_type}_counts", cmds=["/bin/bash"], arguments=["-c", (f"echo 'getting {paper_type} paper counts!' ; rm -r {paper_type} || true ; " diff --git a/company_linkage/sql/paper_visualization_data.sql b/company_linkage/sql/paper_visualization_data.sql index 3f24c27a..d71afe2e 100644 --- a/company_linkage/sql/paper_visualization_data.sql +++ b/company_linkage/sql/paper_visualization_data.sql @@ -1,4 +1,9 @@ -- Selecting the companies we want to leave out + -- Essentially, visualization_data_omit_by_rule contains all the companies that we want + -- to retain after the omit_by_rule process has been applied + -- So, here, in to_omit, we select any company that isn't found in that table as a + -- company we'd like to omit, replicating the rule-based omission. + -- This allows us to omit the same set of companies across all of our tables. 
WITH to_omit AS ( SELECT diff --git a/company_linkage/sql/patent_visualization_data.sql b/company_linkage/sql/patent_visualization_data.sql index 29818a23..8047506a 100644 --- a/company_linkage/sql/patent_visualization_data.sql +++ b/company_linkage/sql/patent_visualization_data.sql @@ -1,4 +1,9 @@ -- Selecting the companies we want to leave out + -- Essentially, visualization_data_omit_by_rule contains all the companies that we want + -- to retain after the omit_by_rule process has been applied + -- So, here, in to_omit, we select any company that isn't found in that table as a + -- company we'd like to omit, replicating the rule-based omission. + -- This allows us to omit the same set of companies across all of our tables. WITH to_omit AS ( SELECT diff --git a/company_linkage/sql/workforce_visualization_data.sql b/company_linkage/sql/workforce_visualization_data.sql index 3779e5e3..e5cc4f8e 100644 --- a/company_linkage/sql/workforce_visualization_data.sql +++ b/company_linkage/sql/workforce_visualization_data.sql @@ -1,4 +1,9 @@ -- Selecting the companies we want to leave out + -- Essentially, visualization_data_omit_by_rule contains all the companies that we want + -- to retain after the omit_by_rule process has been applied + -- So, here, in to_omit, we select any company that isn't found in that table as a + -- company we'd like to omit, replicating the rule-based omission. + -- This allows us to omit the same set of companies across all of our tables. 
WITH to_omit AS ( SELECT From a6c2d223a46fde55cdbbf0d5c493eda0d6a535d4 Mon Sep 17 00:00:00 2001 From: Rebecca Date: Wed, 17 Jan 2024 12:17:06 -0500 Subject: [PATCH 17/17] Fix ai papers table to use merged id countries not ror countries --- company_linkage/sql/ai_publications.sql | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/company_linkage/sql/ai_publications.sql b/company_linkage/sql/ai_publications.sql index 0a000a1d..4c87164a 100644 --- a/company_linkage/sql/ai_publications.sql +++ b/company_linkage/sql/ai_publications.sql @@ -13,10 +13,13 @@ WITH -- Adding in org names and country data using ROR SELECT id, - name AS org_name, - country.country_name AS country + ror.name AS org_name, + standard_name AS country FROM - gcp_cset_ror.ror), + gcp_cset_ror.ror + LEFT JOIN + countries.country_code + ON lower(country.country_code) = lower(country_code.raw_alpha_2)), merged_rors AS ( -- Selecting all the merged ids and ror ids from the literature table SELECT