diff --git a/company_linkage/Dockerfile b/company_linkage/Dockerfile
new file mode 100644
index 00000000..fc5ed907
--- /dev/null
+++ b/company_linkage/Dockerfile
@@ -0,0 +1,36 @@
+FROM ubuntu:20.04
+
+# Make piped RUN commands (the `curl | bash` below) fail when any stage fails,
+# instead of only reporting the exit status of the last command in the pipe.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# Set up system dependencies. The package index is refreshed in the same layer
+# as the install so a stale cached index is never reused, and removed afterwards
+# to keep the layer small.
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+        build-essential \
+        curl \
+        libffi-dev \
+        libssl-dev \
+        python3-dev \
+        python3-pip \
+    && rm -rf /var/lib/apt/lists/*
+
+# install gsutil and put it on the path for airflow to use
+ENV CLOUDSDK_INSTALL_DIR=/usr/local/gcloud/
+RUN curl -fsSL https://sdk.cloud.google.com | bash
+ENV PATH=$PATH:/usr/local/gcloud/google-cloud-sdk/bin
+
+# Install python dependencies before copying the scripts, so that editing a
+# script does not invalidate the cached dependency layer
+WORKDIR /parat
+ENV AIRFLOW_GPL_UNIDECODE=yes
+COPY requirements.txt /parat/requirements.txt
+RUN pip3 install --no-cache-dir -r requirements.txt
+
+# Grab files we need to run
+COPY parat_scripts/* /parat/
+
+# Make sure the above config succeeded
+RUN python3 -m pytest test_aggregate_organizations.py -k test_add_location
\ No newline at end of file
diff --git a/company_linkage/README.md b/company_linkage/README.md index 9dcdb1b3..6a585284 100644 --- a/company_linkage/README.md +++ b/company_linkage/README.md @@ -16,37 +16,47 @@ run some of this code as-is. ## Tasks to build visualization data -1. [creating_organizations_from_airtable_imports.sql](sql/create_organizations_from_airtable_imports.sql) -2. [selecting_ai_publications.sql](sql/selecting_ai_publications.sql) -3. `python3 aggregate_organizations.py aggregated_organizations.jsonl` -4. Replace `high_resolution_entities.aggregated_organizations` with the data from `aggregated_organizations.jsonl` using the [aggregated_organizations_schema](schemas/aggregated_organizations_schema.json) -5. [selecting_ai_patents.sql](sql/selecting_ai_patents.sql) -6. `python3 get_ai_counts.py data/ai_company_papers.jsonl data/ai_company_patents.jsonl` -7. Upload `ai_company_papers.jsonl` to `ai_companies_visualization.ai_company_pubs` using the [ai_papers_schema](schemas/ai_papers_schema.json) -8. 
Upload `ai_company_patents.jsonl` to `ai_companies_visualization.ai_company_patents` using the [ai_patents_schema](schemas/ai_patents_schema.json) -9. [creating_initial_visualization_data_publications.sql](sql/creating_initial_visualization_data_publications.sql) -10. [adding_ai_pubs_by_year_to_visualization.sql](sql/adding_ai_pubs_by_year_to_visualization.sql) -11. [creating_patent_visualization_data.sql](sql/creating_patent_visualization_data.sql) -12. [adding_ai_patents_by_year_to_visualization.sql](sql/adding_ai_patents_by_year_to_visualization.sql) -13. [creating_paper_visualization_data.sql](sql/creating_paper_visualization_data.sql) -14. [adding_top_mag_ai_fields.sql](sql/adding_top_mag_ai_fields.sql) -15. [adding_top_science_map_clusters.sql](sql/adding_top_science_map_clusters.sql) -16. [adding_company_references.sql](sql/adding_company_references.sql) -17. [adding_top_tasks.sql](sql/adding_top_tasks.sql) -18. [adding_top_methods.sql](sql/adding_top_methods.sql) -19. [selecting_top_conference_pubs.sql](sql/selecting_top_conference_pubs.sql) -20. [pulling_publications_in_top_ai_conferences.sql](sql/pulling_publications_in_top_ai_conferences.sql) -21. `python3 top_papers.py top_paper_counts.jsonl` -22. Upload `top_paper_counts.jsonl` to `ai_companies_visualization.top_paper_counts` using the [top_papers_schema](schemas/top_papers_schema.json) -23. [adding_top_paper_counts.sql](sql/adding_top_paper_counts.sql) -24. [selecting_all_publications.sql](sql/selecting_all_publications.sql) -25. `python3 all_papers.py all_paper_counts.jsonl` -26. Upload `all_paper_counts.jsonl` to `ai_companies_visualization.total_paper_counts` using the [all_papers_schema](schemas/all_papers_schema.json) -27. [adding_all_paper_counts.sql](sql/adding_all_paper_counts.sql) -28. [creating_workforce_visualization_data.sql](sql/creating_workforce_visualization_data.sql) -29. [adding_ai_jobs_to_workforce_visualization.sql](sql/adding_ai_jobs_to_workforce_visualization.sql) -31. 
[omit_by_rule.sql](sql/omit_by_rule.sql) -32. [omit_by_rule_papers.sql](sql/omit_by_rule_papers.sql) -33. [omit_by_rule_patents.sql](sql/omit_by_rule_patents.sql) -34. [omit_by_rule_workforce.sql](sql/omit_by_rule_workforce.sql) -35. [adding_crunchbase_company_metadata.sql](sql/adding_crunchbase_company_metadata.sql) \ No newline at end of file +1. [organizations.sql](sql/organizations.sql) +2. [ai_publications.sql](sql/ai_publications.sql) +3. [linked_ai_patents.sql](sql/linked_ai_patents.sql) +4. [top_conference_pubs.sql](sql/top_conference_pubs.sql) +5. [pubs_in_top_conferences.sql](sql/pubs_in_top_conferences.sql) +6. [all_publications.sql](sql/all_publications.sql) +7. `python3 aggregate_organizations.py aggregated_organizations.jsonl` +8. Replace `high_resolution_entities.aggregated_organizations` with the data from `aggregated_organizations.jsonl` using the [aggregated_organizations_schema](schemas/aggregated_organizations_schema.json) +9. `python3 get_ai_counts.py data/ai_company_papers.jsonl data/ai_company_patents.jsonl` +10. Upload `ai_company_papers.jsonl` to `ai_companies_visualization.ai_company_pubs` using the [ai_papers_schema](schemas/ai_papers_schema.json) +11. Upload `ai_company_patents.jsonl` to `ai_companies_visualization.ai_company_patents` using the [ai_patents_schema](schemas/ai_patents_schema.json) +12. `python3 top_papers.py top_paper_counts.jsonl` +13. Upload `top_paper_counts.jsonl` to `ai_companies_visualization.top_paper_counts` using the [top_papers_schema](schemas/top_papers_schema.json) +14. `python3 all_papers.py all_paper_counts.jsonl` +15. Upload `all_paper_counts.jsonl` to `ai_companies_visualization.total_paper_counts` using the [all_papers_schema](schemas/all_papers_schema.json) +16. [initial_visualization_data.sql](sql/initial_visualization_data.sql) +17. [visualization_data_with_by_year.sql](sql/visualization_data_with_by_year.sql) +18. [visualization_data_with_top_papers.sql](sql/visualization_data_with_top_papers.sql) +19. 
[visualization_data_with_all_papers.sql](sql/visualization_data_with_all_papers.sql) +20. [initial_patent_visualization_data.sql](sql/initial_patent_visualization_data.sql) +21. [patent_visualization_data_with_by_year.sql](sql/patent_visualization_data_with_by_year.sql) +22. [initial_paper_visualization_data.sql](sql/initial_paper_visualization_data.sql) +23. [paper_visualization_data_with_mag.sql](sql/paper_visualization_data_with_mag.sql) +24. [paper_visualization_data_with_clusters.sql](sql/paper_visualization_data_with_clusters.sql) +25. [paper_visualization_data_with_company_references.sql](sql/paper_visualization_data_with_company_references.sql) +26. [paper_visualization_data_with_tasks.sql](sql/paper_visualization_data_with_tasks.sql) +27. [paper_visualization_data_with_methods.sql](sql/paper_visualization_data_with_methods.sql) +28. [initial_workforce_visualization_data.sql](sql/initial_workforce_visualization_data.sql) +29. [workforce_visualization_data_with_ai_jobs.sql](sql/workforce_visualization_data_with_ai_jobs.sql) +30. [visualization_data_omit_by_rule.sql](sql/visualization_data_omit_by_rule.sql) +31. [visualization_data.sql](sql/visualization_data.sql) +32. [patent_visualization_data.sql](sql/patent_visualization_data.sql) +33. [paper_visualization_data.sql](sql/paper_visualization_data.sql) +34. [workforce_visualization_data.sql](sql/workforce_visualization_data.sql) + +## Deployment + +To refresh the Docker image (which you must do if you change any of the Python scripts in `parat_scripts/`), run: + +``` +docker build -t parat . 
+docker tag parat us.gcr.io/gcp-cset-projects/parat +docker push us.gcr.io/gcp-cset-projects/parat +``` \ No newline at end of file diff --git a/company_linkage/data/omit.csv b/company_linkage/data/omit.csv deleted file mode 100644 index 5e432fc3..00000000 --- a/company_linkage/data/omit.csv +++ /dev/null @@ -1,40 +0,0 @@ -CSET_id -100 -296 -346 -374 -380 -386 -412 -418 -464 -467 -495 -612 -628 -633 -649 -724 -728 -756 -767 -2287 -2774 -2778 -2784 -2789 -2806 -2815 -2831 -2850 -2851 -2855 -2875 -2922 -2956 -2976 -2977 -2981 -2987 -3036 -3058 \ No newline at end of file
diff --git a/company_linkage/parat_data_dag.py b/company_linkage/parat_data_dag.py
new file mode 100644
index 00000000..4cfcd805
--- /dev/null
+++ b/company_linkage/parat_data_dag.py
@@ -0,0 +1,365 @@
+import os
+from datetime import datetime
+
+from airflow import DAG
+from airflow.operators.python import PythonOperator
+from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator, BigQueryCheckOperator
+from airflow.providers.google.cloud.transfers.bigquery_to_bigquery import BigQueryToBigQueryOperator
+from airflow.providers.google.cloud.operators.kubernetes_engine import GKEStartPodOperator
+from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator
+from airflow.operators.dummy import DummyOperator
+from airflow.providers.google.cloud.operators.gcs import GCSDeleteObjectsOperator
+from dataloader.airflow_utils.defaults import (
+    DATA_BUCKET,
+    PROJECT_ID,
+    GCP_ZONE,
+    DAGS_DIR,
+    get_default_args,
+    get_post_success,
+)
+from dataloader.scripts.populate_documentation import update_table_descriptions
+
+from parat_scripts.aggregate_organizations import aggregate_organizations
+
+bucket = DATA_BUCKET
+initial_dataset = "parat_input"
+intermediate_dataset = "high_resolution_entities"
+production_dataset = "ai_companies_visualization"
+staging_dataset = f"staging_{production_dataset}"
+backups_dataset = f"{production_dataset}_backups"
+sql_dir = "sql/parat"
+schema_dir = "parat/schemas"
+tmp_dir = f"{production_dataset}/tmp"
+
+default_args = get_default_args()
+date = datetime.now().strftime("%Y%m%d")
+
+# Every GKE pod below is restricted to nodes of the default node pool
+# see also https://cloud.google.com/composer/docs/how-to/using/using-kubernetes-pod-operator#affinity-config
+default_pool_affinity = {
+    "nodeAffinity": {
+        "requiredDuringSchedulingIgnoredDuringExecution": {
+            "nodeSelectorTerms": [{
+                "matchExpressions": [{
+                    "key": "cloud.google.com/gke-nodepool",
+                    "operator": "In",
+                    "values": [
+                        "default-pool",
+                    ]
+                }]
+            }]
+        }
+    }
+}
+
+
+def read_query_sequence(sequence_path):
+    """
+    Read a query sequence file, which lists one `dataset,table` pair per line in the order the tables
+    must be built. Blank lines are skipped and surrounding whitespace (including the trailing newline)
+    is removed from both values, so they are safe to use as BigQuery identifiers.
+    :param sequence_path: Path to the sequence csv
+    :return: List of (dataset, table) tuples
+    """
+    with open(sequence_path) as sequence_file:
+        return [tuple(part.strip() for part in line.split(",")) for line in sequence_file if line.strip()]
+
+
+# Part 2: Get data from airtable and update databases
+dag = DAG(
+    "parat",
+    default_args=default_args,
+    description="PARAT data updater",
+    schedule_interval=None,
+    catchup=False,
+    user_defined_macros={
+        "staging_dataset": staging_dataset,
+        "production_dataset": production_dataset,
+        "intermediate_dataset": intermediate_dataset,
+        "initial_dataset": initial_dataset,
+        "backups_dataset": backups_dataset,
+    },
+)
+with dag:
+
+    clear_tmp_dir = GCSDeleteObjectsOperator(
+        task_id="clear_tmp_dir",
+        bucket_name=DATA_BUCKET,
+        prefix=tmp_dir
+    )
+
+    # combine all the airtable tables into joined tables
+
+    start = DummyOperator(task_id="starting")
+
+    join_tables = []
+    for table in ["alias", "grid", "ids", "linkedin", "market", "organizations", "parent", "permid"]:
+
+        join_table = BigQueryInsertJobOperator(
+            task_id=f"join_{table}",
+            configuration={
+                "query": {
+                    "query": f"select distinct * from {initial_dataset}.{table}_preannotation UNION DISTINCT "
+                             f"select distinct * from {initial_dataset}.{table}_validate",
+                    "useLegacySql": False,
+                    "destinationTable": {
+                        "projectId": PROJECT_ID,
+                        "datasetId": initial_dataset,
+                        "tableId": f"{table}_joined"
+                    },
+                    "allowLargeResults": True,
+                    "createDisposition": "CREATE_IF_NEEDED",
+                    "writeDisposition": "WRITE_TRUNCATE"
+                }
+            }
+        )
+        join_tables.append(join_table)
+
+    # Do initial query sequence
+
+    start_initial_tables = DummyOperator(task_id="start_initial_tables")
+
+    wait_for_initial_tables = DummyOperator(task_id="wait_for_initial_tables")
+
+    seq_path_prefix = f"{os.environ.get('DAGS_FOLDER')}/sequences/parat/"
+    initial_query_sequence = "initial_data.csv"
+
+    curr = start_initial_tables
+    for dataset, table in read_query_sequence(seq_path_prefix + initial_query_sequence):
+        table_name = f"{dataset}.{table}"
+        next_tab = BigQueryInsertJobOperator(
+            task_id=f"create_{table_name}",
+            configuration={
+                "query": {
+                    "query": "{% include '" + f"{sql_dir}/{table}.sql" + "' %}",
+                    "useLegacySql": False,
+                    "destinationTable": {
+                        "projectId": PROJECT_ID,
+                        "datasetId": dataset,
+                        "tableId": table
+                    },
+                    "allowLargeResults": True,
+                    "createDisposition": "CREATE_IF_NEEDED",
+                    "writeDisposition": "WRITE_TRUNCATE"
+                }
+            },
+        )
+        curr >> next_tab
+        curr = next_tab
+    curr >> wait_for_initial_tables
+
+    # run aggregate_organizations python and load to GCS
+    aggregated_table = "aggregated_organizations"
+
+    # the task gets its own name so that it does not rebind the imported aggregate_organizations function
+    run_aggregate_organizations = PythonOperator(
+        task_id="aggregate_organizations",
+        op_kwargs={
+            "output_file": f"{aggregated_table}.jsonl"
+        },
+        python_callable=aggregate_organizations,
+    )
+
+    # load aggregated_organizations to BigQuery
+
+    load_aggregated_orgs = GCSToBigQueryOperator(
+        task_id=f"load_{aggregated_table}",
+        bucket=DATA_BUCKET,
+        source_objects=[f"{tmp_dir}/{aggregated_table}.jsonl"],
+        schema_object=f"{schema_dir}/{aggregated_table}.json",
+        destination_project_dataset_table=f"{intermediate_dataset}.{aggregated_table}",
+        source_format="NEWLINE_DELIMITED_JSON",
+        create_disposition="CREATE_IF_NEEDED",
+        write_disposition="WRITE_TRUNCATE"
+    )
+
+    run_get_ai_counts = GKEStartPodOperator(
+        task_id="run_get_ai_counts",
+        project_id=PROJECT_ID,
+        location=GCP_ZONE,
+        cluster_name="cc2-task-pool",
+        name="run_get_ai_counts",
+        cmds=["/bin/bash"],
+        arguments=["-c", (f"echo 'getting AI counts!' ; rm -r ai || true ; "
+                          f"mkdir -p ai && "
+                          f"python3 get_ai_counts.py ai/ai_company_papers.jsonl ai/ai_company_patents.jsonl && "
+                          f"gsutil -m cp -r ai gs://{DATA_BUCKET}/{tmp_dir}/ ")],
+        namespace="default",
+        image=f"us.gcr.io/{PROJECT_ID}/parat",
+        get_logs=True,
+        startup_timeout_seconds=300,
+        affinity=default_pool_affinity
+    )
+
+    load_ai_papers = GCSToBigQueryOperator(
+        task_id="load_ai_company_papers",
+        bucket=DATA_BUCKET,
+        source_objects=[f"{tmp_dir}/ai/ai_company_papers.jsonl"],
+        schema_object=f"{schema_dir}/ai_papers_schema.json",
+        destination_project_dataset_table=f"{staging_dataset}.ai_company_papers",
+        source_format="NEWLINE_DELIMITED_JSON",
+        create_disposition="CREATE_IF_NEEDED",
+        write_disposition="WRITE_TRUNCATE"
+    )
+
+    load_ai_patents = GCSToBigQueryOperator(
+        task_id="load_ai_company_patents",
+        bucket=DATA_BUCKET,
+        source_objects=[f"{tmp_dir}/ai/ai_company_patents.jsonl"],
+        schema_object=f"{schema_dir}/ai_patents_schema.json",
+        destination_project_dataset_table=f"{staging_dataset}.ai_company_patents",
+        source_format="NEWLINE_DELIMITED_JSON",
+        create_disposition="CREATE_IF_NEEDED",
+        write_disposition="WRITE_TRUNCATE"
+    )
+
+    run_papers = []
+    for paper_type in ["top", "all"]:
+
+        run_get_paper_counts = GKEStartPodOperator(
+            task_id=f"run_get_{paper_type}_counts",
+            project_id=PROJECT_ID,
+            location=GCP_ZONE,
+            cluster_name="cc2-task-pool",
+            name=f"run_get_{paper_type}_counts",
+            cmds=["/bin/bash"],
+            arguments=["-c", (f"echo 'getting {paper_type} paper counts!' ; rm -r {paper_type} || true ; "
+                              f"mkdir -p {paper_type} && "
+                              f"python3 {paper_type}_papers.py {paper_type}/{paper_type}_paper_counts.jsonl && "
+                              f"gsutil -m cp -r {paper_type} gs://{DATA_BUCKET}/{tmp_dir}/ ")],
+            namespace="default",
+            image=f"us.gcr.io/{PROJECT_ID}/parat",
+            get_logs=True,
+            startup_timeout_seconds=300,
+            affinity=default_pool_affinity
+        )
+        run_papers.append(run_get_paper_counts)
+
+    # even though these are near-identical we do these in sequence -- we'd have to put in a dummy operator
+    # otherwise anyway and they should be fast
+
+    load_top_papers = GCSToBigQueryOperator(
+        task_id="load_top_papers",
+        bucket=DATA_BUCKET,
+        source_objects=[f"{tmp_dir}/top/top_paper_counts.jsonl"],
+        schema_object=f"{schema_dir}/top_papers_schema.json",
+        destination_project_dataset_table=f"{staging_dataset}.top_paper_counts",
+        source_format="NEWLINE_DELIMITED_JSON",
+        create_disposition="CREATE_IF_NEEDED",
+        write_disposition="WRITE_TRUNCATE"
+    )
+
+    load_all_papers = GCSToBigQueryOperator(
+        task_id="load_all_papers",
+        bucket=DATA_BUCKET,
+        source_objects=[f"{tmp_dir}/all/all_paper_counts.jsonl"],
+        schema_object=f"{schema_dir}/all_papers_schema.json",
+        destination_project_dataset_table=f"{staging_dataset}.all_paper_counts",
+        source_format="NEWLINE_DELIMITED_JSON",
+        create_disposition="CREATE_IF_NEEDED",
+        write_disposition="WRITE_TRUNCATE"
+    )
+
+    start_visualization_tables = DummyOperator(task_id="start_visualization_tables")
+    wait_for_visualization_tables = DummyOperator(task_id="wait_for_visualization_tables")
+
+    visualization_query_sequence = "visualization_data.csv"
+
+    curr = start_visualization_tables
+    for dataset, table in read_query_sequence(seq_path_prefix + visualization_query_sequence):
+        table_name = f"{dataset}.{table}"
+        next_tab = BigQueryInsertJobOperator(
+            task_id=f"create_{table_name}",
+            configuration={
+                "query": {
+                    "query": "{% include '" + f"{sql_dir}/{table}.sql" + "' %}",
+                    "useLegacySql": False,
+                    "destinationTable": {
+                        "projectId": PROJECT_ID,
+                        "datasetId": dataset,
+                        "tableId": table
+                    },
+                    "allowLargeResults": True,
+                    "createDisposition": "CREATE_IF_NEEDED",
+                    "writeDisposition": "WRITE_TRUNCATE"
+                }
+            },
+        )
+        curr >> next_tab
+        curr = next_tab
+    curr >> wait_for_visualization_tables
+
+    checks = []
+    for query in os.listdir(f"{DAGS_DIR}/{sql_dir}"):
+        if not query.startswith("check_"):
+            continue
+        checks.append(BigQueryCheckOperator(
+            task_id=query.replace(".sql", ""),
+            sql=f"{sql_dir}/{query}",
+            use_legacy_sql=False
+        ))
+
+    wait_for_checks = DummyOperator(task_id="wait_for_checks")
+
+    wait_for_copy = DummyOperator(task_id="wait_for_copy")
+
+    curr_date = datetime.now().strftime('%Y%m%d')
+    prod_tables = ["visualization_data", "paper_visualization_data",
+                   "patent_visualization_data", "workforce_visualization_data"]
+    for table in prod_tables:
+        prod_table_name = f"{production_dataset}.{table}"
+        copy_to_production = BigQueryToBigQueryOperator(
+            task_id="copy_" + table + "_to_production",
+            source_project_dataset_tables=[staging_dataset + "." + table],
+            destination_project_dataset_table=prod_table_name,
+            create_disposition="CREATE_IF_NEEDED",
+            write_disposition="WRITE_TRUNCATE"
+        )
+        pop_descriptions = PythonOperator(
+            task_id="populate_column_documentation_for_" + table,
+            op_kwargs={
+                "input_schema": f"{os.environ.get('DAGS_FOLDER')}/schemas/parat/{table}.json",
+                "table_name": prod_table_name
+            },
+            python_callable=update_table_descriptions
+        )
+        table_backup = BigQueryToBigQueryOperator(
+            task_id=f"back_up_{table}",
+            source_project_dataset_tables=[f"{staging_dataset}.{table}"],
+            destination_project_dataset_table=f"{backups_dataset}.{table}_{curr_date}",
+            create_disposition="CREATE_IF_NEEDED",
+            write_disposition="WRITE_TRUNCATE"
+        )
+        wait_for_checks >> copy_to_production >> pop_descriptions >> table_backup >> wait_for_copy
+
+    # post success to slack
+    msg_success = get_post_success("PARAT tables updated!", dag)
+
+    (
+        clear_tmp_dir
+        >> start
+        >> join_tables
+        >> start_initial_tables
+    )
+    (
+        wait_for_initial_tables
+        >> run_aggregate_organizations
+        >> load_aggregated_orgs
+        >> run_get_ai_counts
+        >> load_ai_papers
+        >> load_ai_patents
+        >> run_papers
+        >> load_top_papers
+        >> load_all_papers
+        >> start_visualization_tables
+    )
+    (
+        wait_for_visualization_tables
+        >> checks
+        >> wait_for_checks
+    )
+    (
+        wait_for_copy
+        >> msg_success
+    )
+
diff --git a/company_linkage/aggregate_organizations.py b/company_linkage/parat_scripts/aggregate_organizations.py similarity index 96% rename from company_linkage/aggregate_organizations.py rename to company_linkage/parat_scripts/aggregate_organizations.py index 0029ed62..43dcbddc 100644 --- a/company_linkage/aggregate_organizations.py +++ b/company_linkage/parat_scripts/aggregate_organizations.py @@ -2,6 +2,7 @@ from google.cloud import bigquery import json from collections import defaultdict +import subprocess # List of companies not being aggregated # note: check 
https://docs.google.com/spreadsheets/d/1Tq28O8qIA6T3AJ5oTHKCcscaNZsY_E4OPOUm6JaiwWA/edit#gid=0 @@ -22,7 +23,7 @@ def __init__(self, cset_id, name): self.market = [] self.crunchbase = {} self.child_crunchbase = [] - self.grid = [] + self.ror = [] self.regex = [] self.bgov_id = [] self.comment = None @@ -129,14 +130,14 @@ def add_child_crunchbase(self, uuid, url): if crunchbase not in self.child_crunchbase and crunchbase != self.crunchbase: self.child_crunchbase.append(crunchbase) - def add_grid(self, grid): + def add_ror(self, ror): """ - Adding GRID (from grid.ac) for aggregation - :param grid: grid value + Adding ROR for aggregation + :param ror: ror value :return: """ - if grid and grid not in self.grid: - self.grid.append(grid) + if ror and ror not in self.ror: + self.ror.append(ror) def add_regex(self, regex): """ @@ -368,8 +369,8 @@ def update_organization_identifiers(self, org, org_id): org_info.add_child_crunchbase(org["crunchbase"]["crunchbase_uuid"], org["crunchbase"]["crunchbase_url"]) else: org_info.add_crunchbase(org["crunchbase"]["crunchbase_uuid"], org["crunchbase"]["crunchbase_url"]) - for grid in org["grid"]: - org_info.add_grid(grid) + for ror in org["ror_id"]: + org_info.add_ror(ror) org_info.add_regex(org["regex"]) org_info.add_linkedin(org["linkedin"]) org_info.add_bgov_id(org["BGOV_id"]) @@ -394,7 +395,7 @@ def update_organization_data(self, org, org_id): org_info.add_sandp(org["in_sandp_500"]) org_info.add_fortune(org["in_fortune_global_500"]) - def print_output(self, output_file): + def print_output(self, output_file, local): """ Writing the aggregated organization output to file :param output_file: The output file we're writing to @@ -407,25 +408,29 @@ def print_output(self, output_file): "aliases": org_info.aliases, "parent": org_info.parent, "permid": org_info.permid, "market": org_info.market, "crunchbase": org_info.crunchbase, "child_crunchbase": org_info.child_crunchbase, - "grid": org_info.grid, "regex": org_info.regex, + "ror_id": 
org_info.ror, "regex": org_info.regex, "BGOV_id": org_info.bgov_id, "linkedin": org_info.linkedin, "in_sandp_500": org_info.in_sandp_500, "in_fortune_global_500": org_info.in_fortune_global_500, "comment": org_info.comment, "children": org_info.children, "non_agg_children": org_info.non_agg_children} out.write(json.dumps(js, ensure_ascii=False) + "\n") out.close() + if not local: + subprocess.run(["gsutil", "-m", "cp", "-r", output_file, "gs://airflow-data-exchange/ai_companies_visualization/tmp/"], check=True) -def main(): - parser = argparse.ArgumentParser() - parser.add_argument("output_file", type=str, help="A jsonl file for writing output data to create new tables") - args = parser.parse_args() - if not args.output_file.endswith(".jsonl"): - parser.print_help() + +def aggregate_organizations(output_file, local=False): aggregator = OrganizationAggregator() aggregator.get_parents() aggregator.get_organizations() - aggregator.print_output(args.output_file) + aggregator.print_output(output_file, local) if __name__ == "__main__":
-    main()
+    parser = argparse.ArgumentParser()
+    parser.add_argument("output_file", type=str, help="A jsonl file for writing output data to create new tables")
+    args = parser.parse_args()
+    if not args.output_file.endswith(".jsonl"):
+        parser.error("output_file must be a .jsonl file")  # prints usage and exits instead of running anyway
+    aggregate_organizations(args.output_file, local=True)
+
diff --git a/company_linkage/all_papers.py b/company_linkage/parat_scripts/all_papers.py similarity index 92% rename from company_linkage/all_papers.py rename to company_linkage/parat_scripts/all_papers.py index 9fe5d534..f9234069 100644 --- a/company_linkage/all_papers.py +++ b/company_linkage/parat_scripts/all_papers.py @@ -18,7 +18,7 @@ def main() -> None: paper_finder.get_identifiers() # These are the only two lines that make this different from running AI pubs # We select from a different table - table_name = "ai_companies_visualization.all_publications" + table_name = "staging_ai_companies_visualization.all_publications" # 
And we write out our data to a different variable companies = paper_finder.run_query_papers(table_name, "all_pubs", by_year=True) paper_finder.write_output(companies, args.output_file) diff --git a/company_linkage/deduplicate_companies.py b/company_linkage/parat_scripts/deduplicate_companies.py similarity index 100% rename from company_linkage/deduplicate_companies.py rename to company_linkage/parat_scripts/deduplicate_companies.py diff --git a/company_linkage/get_ai_counts.py b/company_linkage/parat_scripts/get_ai_counts.py similarity index 90% rename from company_linkage/get_ai_counts.py rename to company_linkage/parat_scripts/get_ai_counts.py index 504099de..681f704d 100644 --- a/company_linkage/get_ai_counts.py +++ b/company_linkage/parat_scripts/get_ai_counts.py @@ -14,7 +14,7 @@ def __init__(self) -> None: AI papers in top conferences, etc.) and AI patents (from Dimensions and 1790 jointly). """ self.regex_dict = defaultdict(list) - self.grid_dict = defaultdict(list) + self.ror_dict = defaultdict(list) self.cset_ids = [] self.company_ids = [] self.patent_fields = ["Physical_Sciences_and_Engineering", @@ -58,7 +58,7 @@ def get_identifiers(self) -> None: Pulling the regular expressions used to find papers and patents through means other than GRID. 
:return: """ - regex_query = """SELECT CSET_id, regex, grid FROM + regex_query = """SELECT CSET_id, regex, ror_id FROM `gcp-cset-projects.high_resolution_entities.aggregated_organizations`""" client = bigquery.Client() query_job = client.query(regex_query) @@ -67,15 +67,15 @@ def get_identifiers(self) -> None: if result.regex: for regex in result.regex: self.regex_dict[result.CSET_id].append(regex) - if result.grid: - for grid_id in result.grid: - self.grid_dict[result.CSET_id].append(grid_id) + if result.ror_id: + for ror in result.ror_id: + self.ror_dict[result.CSET_id].append(ror) self.cset_ids.append(result.CSET_id) def run_query_papers(self, table_name: str, field_name: str, test: bool = False, by_year: bool = False) -> list: """ - Running a query to find paper counts using regex for papers missing GRID. This query combines - this data with preexisting paper counts already identified using SQL for papers that have GRID. + Running a query to find paper counts using regex for papers missing ROR. This query combines + this data with preexisting paper counts already identified using SQL for papers that have ROR. We no longer use this query for AI papers, but it is still used for top conference papers and total papers. 
:param table_name: The table to look for papers in @@ -100,8 +100,8 @@ def run_query_papers(self, table_name: str, field_name: str, test: bool = False, if len(regexes) > 1: for regex in regexes[1:]: query += f"""OR regexp_contains(org_name, r'(?i){regex}') """ - if cset_id in self.grid_dict: - query += f"""OR grid_id IN ({str(self.grid_dict[cset_id])[1:-1]})""" + if cset_id in self.ror_dict: + query += f"""OR ror_id IN ({str(self.ror_dict[cset_id])[1:-1]})""" query_job = client.query(query) # query_job is an iterator, so even though we're only returning one row we're going to loop for element in query_job: @@ -109,7 +109,7 @@ def run_query_papers(self, table_name: str, field_name: str, test: bool = False, # if we don't have total data, we won't have by_year either if by_year: row_dict[field_name_by_year] = self.run_query_papers_by_year(table_name, field_name, regexes, - self.grid_dict[cset_id]) + self.ror_dict[cset_id]) if not row_dict[field_name]: # if we end up without any papers, set that to be true row_dict[field_name] = 0 @@ -119,7 +119,7 @@ def run_query_papers(self, table_name: str, field_name: str, test: bool = False, companies.append(row_dict) return companies - def run_query_papers_by_year(self, table_name: str, field_name: str, regexes: list, grids: list) -> list: + def run_query_papers_by_year(self, table_name: str, field_name: str, regexes: list, rors: list) -> list: """ Getting the same paper count data, except split by year. 
We no longer use this query for AI papers, but it is still used for top conference papers and @@ -127,7 +127,7 @@ def run_query_papers_by_year(self, table_name: str, field_name: str, regexes: li :param table_name: The table to look for papers in :param field_name: The json field name :param regexes: The regexes for whichever CSET_id we're searching for - :param grids: The grids for whichever CSET_id we're searching for if they exist; otherwise an empty list + :param rors: The rors for whichever CSET_id we're searching for if they exist; otherwise an empty list :return: """ field_name_by_year = f"{field_name}_by_year" @@ -143,8 +143,8 @@ def run_query_papers_by_year(self, table_name: str, field_name: str, regexes: li for regex in regexes[1:]: # regex_to_use = rf"r'(?i){regex}'" query += f"""OR regexp_contains(org_name, r'(?i){regex}') """ - if grids: - query += f"""OR grid_id IN ({str(grids)[1:-1]}) """ + if rors: + query += f"""OR ror_id IN ({str(rors)[1:-1]}) """ query += """GROUP BY year ORDER BY year""" client = bigquery.Client() query_job = client.query(query) @@ -160,7 +160,7 @@ def run_query_id_papers(self, table_name: str, test: bool = False) -> list: :param test: False if not running as a unit test :return: """ - companies_query = f"""SELECT CSET_id, grid FROM + companies_query = f"""SELECT CSET_id, ror_id FROM `gcp-cset-projects.high_resolution_entities.aggregated_organizations`""" if test: companies_query += """ LIMIT 25""" @@ -180,9 +180,9 @@ def run_query_id_papers(self, table_name: str, test: bool = False) -> list: if len(regexes) > 1: for regex in regexes[1:]: query += f"""OR regexp_contains(org_name, r'(?i){regex}') """ - if row["grid"]: - self.grid_dict[row["CSET_id"]] = row["grid"] - query += f"""OR grid_id IN ({str(row["grid"])[1:-1]})""" + if row["ror_id"]: + self.ror_dict[row["CSET_id"]] = row["ror_id"] + query += f"""OR ror_id IN ({str(row["ror_id"])[1:-1]})""" query_job = client.query(query) # get all the merged ids for element in query_job: 
@@ -192,11 +192,15 @@ def run_query_id_papers(self, table_name: str, test: bool = False) -> list: return company_rows def run_query_id_patents(self): + """ + Get patent counts one by one using CSET_ids. + :return: + """ patent_companies = [] for cset_id in self.company_ids: if cset_id in self.regex_dict: regexes = self.regex_dict[cset_id] - grids = self.grid_dict[cset_id] + rors = self.ror_dict[cset_id] query = f"""SELECT DISTINCT family_id, priority_year, @@ -236,14 +240,14 @@ def run_query_id_patents(self): Machine_Learning, Search_Methods FROM - ai_companies_visualization.linked_ai_patents + staging_ai_companies_visualization.linked_ai_patents WHERE regexp_contains(assignee, r'(?i){regexes[0]}') """ # if we have more than one regex for an org, include all of them if len(regexes) > 1: for regex in regexes[1:]: query += f"""OR regexp_contains(assignee, r'(?i){regex}') """ - if grids: - query += f"""OR grid IN ({str(grids)[1:-1]})""" + if rors: + query += f"""OR ror_id IN ({str(rors)[1:-1]})""" client = bigquery.Client() query_job = client.query(query) for row in query_job: @@ -278,7 +282,7 @@ def main() -> None: count_getter = CountGetter() print("Fetching identifiers") count_getter.get_identifiers() - table_name = "gcp-cset-projects.ai_companies_visualization.ai_publications" + table_name = "gcp-cset-projects.staging_ai_companies_visualization.ai_publications" print("Fetching paper data") company_rows = count_getter.run_query_id_papers(table_name) print("Writing results") diff --git a/company_linkage/test_aggregate_organizations.py b/company_linkage/parat_scripts/test_aggregate_organizations.py similarity index 92% rename from company_linkage/test_aggregate_organizations.py rename to company_linkage/parat_scripts/test_aggregate_organizations.py index 76709ac6..c9e31397 100644 --- a/company_linkage/test_aggregate_organizations.py +++ b/company_linkage/parat_scripts/test_aggregate_organizations.py @@ -1,6 +1,6 @@ import os import unittest -from company_linkage 
import aggregate_organizations +import aggregate_organizations from collections import defaultdict @@ -18,7 +18,7 @@ def test_init(self): self.assertEqual(org.market, []) self.assertEqual(org.crunchbase, {}) self.assertEqual(org.child_crunchbase, []) - self.assertEqual(org.grid, []) + self.assertEqual(org.ror, []) self.assertEqual(org.regex, []) self.assertEqual(org.bgov_id, []) self.assertEqual(org.comment, None) @@ -116,30 +116,30 @@ def test_add_child_crunchbase(self): "https://www.crunchbase.com/organization/algorithmia") self.assertEqual(len(org.child_crunchbase), 2) - def test_add_grid(self): + def test_add_ror(self): org = aggregate_organizations.Organization(1, "test") - org.add_grid("grid.419660.c") - self.assertEqual(org.grid[0], "grid.419660.c") - self.assertEqual(len(org.grid), 1) + org.add_ror("https://ror.org/05a8p8995") + self.assertEqual(org.ror[0], "https://ror.org/05a8p8995") + self.assertEqual(len(org.ror), 1) # Don't add a duplicate entry! - org.add_grid("grid.419660.c") - self.assertEqual(len(org.grid), 1) + org.add_ror("https://ror.org/05a8p8995") + self.assertEqual(len(org.ror), 1) # Do add a new one - org.add_grid("grid.481863.0") - self.assertEqual(org.grid[1], "grid.481863.0") - self.assertEqual(len(org.grid), 2) + org.add_ror("https://ror.org/00kdbj440") + self.assertEqual(org.ror[1], "https://ror.org/00kdbj440") + self.assertEqual(len(org.ror), 2) def test_add_regex(self): org = aggregate_organizations.Organization(1, "test") - org.add_regex("^hhi\s+corporation$|^hhi$|^hhi\s+corp$") - self.assertEqual(org.regex[0], "^hhi\s+corporation$|^hhi$|^hhi\s+corp$") + org.add_regex(r"^hhi\s+corporation$|^hhi$|^hhi\s+corp$") + self.assertEqual(org.regex[0], r"^hhi\s+corporation$|^hhi$|^hhi\s+corp$") self.assertEqual(len(org.regex), 1) # Don't add a duplicate entry! 
- org.add_regex("^hhi\s+corporation$|^hhi$|^hhi\s+corp$") + org.add_regex(r"^hhi\s+corporation$|^hhi$|^hhi\s+corp$") self.assertEqual(len(org.regex), 1) # Do add a new one - org.add_regex("^hhi\s+corporation$") - self.assertEqual(org.regex[1], "^hhi\s+corporation$") + org.add_regex(r"^hhi\s+corporation$") + self.assertEqual(org.regex[1], r"^hhi\s+corporation$") self.assertEqual(len(org.regex), 2) def test_add_bgov_id(self): @@ -157,8 +157,8 @@ def test_add_bgov_id(self): def test_add_comment(self): org = aggregate_organizations.Organization(1, "test") - org.add_comment("grid id not available") - self.assertEqual(org.comment, "grid id not available") + org.add_comment("crunchbase id not available") + self.assertEqual(org.comment, "crunchbase id not available") other_org = aggregate_organizations.Organization(2, "test_2") other_org.add_comment("") self.assertEqual(other_org.comment, None) diff --git a/company_linkage/test_ai_counts.py b/company_linkage/parat_scripts/test_ai_counts.py similarity index 89% rename from company_linkage/test_ai_counts.py rename to company_linkage/parat_scripts/test_ai_counts.py index 83bf622b..926a2731 100644 --- a/company_linkage/test_ai_counts.py +++ b/company_linkage/parat_scripts/test_ai_counts.py @@ -1,5 +1,5 @@ import unittest -from company_linkage.get_ai_counts import CountGetter +from get_ai_counts import CountGetter import warnings @@ -22,7 +22,7 @@ def test_get_identifiers(self): count_getter.get_identifiers() # the dicts are populated self.assertGreater(len(count_getter.regex_dict), 0) - self.assertGreater(len(count_getter.grid_dict), 0) + self.assertGreater(len(count_getter.ror_dict), 0) self.assertGreater(len(count_getter.cset_ids), 0) self.assertEqual(type(count_getter.cset_ids), list) # the values in the dict are the correct type @@ -30,16 +30,16 @@ def test_get_identifiers(self): self.assertEqual(type(key_val), int) # we allow multiple regexes, so we have a list self.assertEqual(type(count_getter.regex_dict[key_val]), 
list) - for key_val in count_getter.grid_dict.keys(): + for key_val in count_getter.ror_dict.keys(): self.assertEqual(type(key_val), int) # we allow multiple regexes, so we have a list - self.assertEqual(type(count_getter.grid_dict[key_val]), list) + self.assertEqual(type(count_getter.ror_dict[key_val]), list) @ignore_warnings def test_run_query_papers(self): count_getter = CountGetter() count_getter.get_identifiers() - table_name = "gcp-cset-projects.ai_companies_visualization.ai_publications" + table_name = "gcp-cset-projects.staging_ai_companies_visualization.ai_publications" test = True companies = count_getter.run_query_papers(table_name, "ai_pubs", test=test, by_year=False) # Make sure we're setting the AI pubs for every company! @@ -67,7 +67,7 @@ def test_run_query_papers(self): def test_run_query_id_papers(self): count_getter = CountGetter() count_getter.get_identifiers() - table_name = "gcp-cset-projects.ai_companies_visualization.ai_publications" + table_name = "gcp-cset-projects.staging_ai_companies_visualization.ai_publications" test = True company_rows = count_getter.run_query_id_papers(table_name, test=test) for company_row in company_rows: @@ -84,7 +84,7 @@ def test_run_query_id_papers(self): def test_run_query_id_patents(self): count_getter = CountGetter() count_getter.get_identifiers() - table_name = "gcp-cset-projects.ai_companies_visualization.ai_publications" + table_name = "gcp-cset-projects.staging_ai_companies_visualization.ai_publications" test = True count_getter.run_query_id_papers(table_name, test) patent_companies = count_getter.run_query_id_patents() diff --git a/company_linkage/top_papers.py b/company_linkage/parat_scripts/top_papers.py similarity index 87% rename from company_linkage/top_papers.py rename to company_linkage/parat_scripts/top_papers.py index 962bcf97..bc1bbece 100644 --- a/company_linkage/top_papers.py +++ b/company_linkage/parat_scripts/top_papers.py @@ -1,6 +1,6 @@ import argparse -from company_linkage.get_ai_counts 
import CountGetter +from get_ai_counts import CountGetter def main() -> None: @@ -18,7 +18,7 @@ def main() -> None: paper_finder.get_identifiers() # These are the only two lines that make this different from running AI pubs # We select from a different table - table_name = "ai_companies_visualization.pubs_in_top_conferences" + table_name = "staging_ai_companies_visualization.pubs_in_top_conferences" # And we write out our data to a different variable companies = paper_finder.run_query_papers(table_name, "ai_pubs_in_top_conferences", by_year=True) paper_finder.write_output(companies, args.output_file) diff --git a/company_linkage/push_to_airflow.sh b/company_linkage/push_to_airflow.sh old mode 100644 new mode 100755 index ca988b32..675430e0 --- a/company_linkage/push_to_airflow.sh +++ b/company_linkage/push_to_airflow.sh @@ -3,10 +3,18 @@ gsutil rm -r gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/airtable_to_bq_confi gsutil cp -r airtable_configs/parat_preannotation gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/airtable_to_bq_config/ gsutil cp -r airtable_configs/parat_validate gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/airtable_to_bq_config/ -gsutil cp airtable_queries/parat_preannotation/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/bq_to_airtable/parat_preannotation/ -gsutil cp airtable_queries/parat_preannotation/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/bq_to_airtable/parat_validate/ gsutil cp airtable_queries/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/airtable_to_bq/parat_preannotation/ gsutil cp airtable_queries/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/airtable_to_bq/parat_validate/ gsutil cp airtable_schemas/parat_preannotation/* gs://airflow-data-exchange-development/schemas/airtable_to_bq/parat_preannotation/ -gsutil cp airtable_schemas/parat_validate/* gs://airflow-data-exchange-development/schemas/airtable_to_bq/parat_validate/ \ No newline at end of file +gsutil cp airtable_schemas/parat_validate/* 
gs://airflow-data-exchange-development/schemas/airtable_to_bq/parat_validate/ + +gsutil cp parat_data_dag.py gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/ +gsutil cp aggregate_organizations.py gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/ +gsutil cp sequences/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sequences/parat/ +gsutil rm gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/parat/* +gsutil cp sql/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/sql/parat/ +gsutil cp schemas/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/schemas/parat/ +gsutil rm -r gs://airflow-data-exchange/parat/schemas/* +gsutil cp schemas/* gs://airflow-data-exchange/parat/schemas/ +gsutil -m cp -r parat_scripts/* gs://us-east1-dev2023-cc1-b088c7e1-bucket/dags/parat_scripts/ \ No newline at end of file diff --git a/company_linkage/requirements.txt b/company_linkage/requirements.txt new file mode 100644 index 00000000..10b51281 --- /dev/null +++ b/company_linkage/requirements.txt @@ -0,0 +1,56 @@ +attrs==21.2.0 +cachetools==4.1.1 +certifi==2020.6.20 +cffi==1.14.3 +chardet==3.0.4 +coverage==5.5 +google-api-core==1.30.0 +google-auth==1.30.2 +google-auth-oauthlib==0.4.4 +google-cloud-bigquery==2.20.0 +google-cloud-bigquery-storage==2.4.0 +google-cloud-core==1.6.0 +google-cloud-translate==3.2.0 +google-crc32c==1.1.2 +google-resumable-media==1.3.0 +googleapis-common-protos==1.53.0 +grpcio==1.33.1 +idna==2.10 +iniconfig==1.1.1 +libcst==0.3.13 +mypy-extensions==0.4.3 +numpy==1.20.3 +oauthlib==3.1.0 +packaging==20.9 +pandas==1.1.3 +pandas-gbq==0.14.0 +Pillow==8.2.0 +pluggy==0.13.1 +pprintpp==0.4.0 +proto-plus==1.11.0 +protobuf==3.13.0 +py==1.10.0 +pyarrow==3.0.0 +pyasn1==0.4.8 +pyasn1-modules==0.2.8 +pycld2==0.41 +pycountry==20.7.3 +pycountry-convert==0.7.2 +pycparser==2.20 +pydata-google-auth==1.1.0 +pyparsing==2.4.7 +pytest==6.2.4 +pytest-cov==2.12.1 +pytest-mock==3.6.1 +python-dateutil==2.8.1 +pytz==2020.1 +PyYAML==5.3.1 +repoze.lru==0.7 +requests==2.24.0 +requests-oauthlib==1.3.0 
+rsa==4.6 +six==1.15.0 +toml==0.10.2 +typing-extensions==3.7.4.3 +typing-inspect==0.6.0 +urllib3==1.25.11 diff --git a/company_linkage/schemas/aggregated_organizations_schema.json b/company_linkage/schemas/aggregated_organizations.json similarity index 99% rename from company_linkage/schemas/aggregated_organizations_schema.json rename to company_linkage/schemas/aggregated_organizations.json index 40cf0ef4..22aa3e28 100644 --- a/company_linkage/schemas/aggregated_organizations_schema.json +++ b/company_linkage/schemas/aggregated_organizations.json @@ -157,9 +157,9 @@ }, { "mode": "REPEATED", - "name": "grid", + "name": "ror_id", "type": "STRING", - "description": "The company's GRID identifier." + "description": "The company's ROR identifier." }, { "mode": "REPEATED", diff --git a/company_linkage/schemas/paper_visualization_data.json b/company_linkage/schemas/paper_visualization_data.json new file mode 100644 index 00000000..cc9e5aab --- /dev/null +++ b/company_linkage/schemas/paper_visualization_data.json @@ -0,0 +1,128 @@ +[ + { + "mode": "NULLABLE", + "name": "CSET_id", + "type": "INTEGER", + "description": "CSET id of PARAT company." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year cited." + }, + { + "mode": "NULLABLE", + "name": "citation_count", + "type": "INTEGER", + "description": "Count of publications in that year that cite AI papers written by the company." + } + ], + "mode": "REPEATED", + "name": "citation_count_by_year", + "type": "RECORD", + "description": "Citations of AI papers by the company by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "field_name", + "type": "STRING", + "description": "Field of study name." + }, + { + "mode": "NULLABLE", + "name": "field_count", + "type": "INTEGER", + "description": "Count of AI papers by the company where field of study is in their top fields." 
+ } + ], + "mode": "REPEATED", + "name": "fields", + "type": "RECORD", + "description": "Fields of study counts (using MAG-style fields of study for AI-relevant fields)." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "cluster_id", + "type": "INTEGER", + "description": "Map of Science research cluster ID." + }, + { + "mode": "NULLABLE", + "name": "cluster_count", + "type": "INTEGER", + "description": "Count of how many AI publications from the company appear in that cluster." + } + ], + "mode": "REPEATED", + "name": "clusters", + "type": "RECORD", + "description": "Counts of top publications in research clusters." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "ref_CSET_id", + "type": "INTEGER", + "description": "CSET id of referenced PARAT company." + }, + { + "mode": "NULLABLE", + "name": "referenced_count", + "type": "INTEGER", + "description": "Count of how many AI publications by that company the primary PARAT company has referenced in their papers." + } + ], + "mode": "REPEATED", + "name": "company_references", + "type": "RECORD", + "description": "Counts of publication references to the publications of other companies in the PARAT dataset." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "referent", + "type": "STRING", + "description": "The task name referent." + }, + { + "mode": "NULLABLE", + "name": "task_count", + "type": "INTEGER", + "description": "Count of how many AI publications by the company contain this task." + } + ], + "mode": "REPEATED", + "name": "tasks", + "type": "RECORD", + "description": "AI task information." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "referent", + "type": "STRING", + "description": "The method name referent." + }, + { + "mode": "NULLABLE", + "name": "method_count", + "type": "INTEGER", + "description": "Count of how many AI publications by the company contain this method." 
+ } + ], + "mode": "REPEATED", + "name": "methods", + "type": "RECORD", + "description": "AI method information." + } +] \ No newline at end of file diff --git a/company_linkage/schemas/patent_visualization_data.json b/company_linkage/schemas/patent_visualization_data.json new file mode 100644 index 00000000..42496163 --- /dev/null +++ b/company_linkage/schemas/patent_visualization_data.json @@ -0,0 +1,950 @@ +[ + { + "mode": "NULLABLE", + "name": "CSET_id", + "type": "INTEGER", + "description": "CSET id of PARAT company." + }, + { + "mode": "NULLABLE", + "name": "name", + "type": "STRING", + "description": "Name of PARAT company." + }, + { + "mode": "NULLABLE", + "name": "ai_patents", + "type": "INTEGER", + "description": "Total AI patent families." + }, + { + "mode": "NULLABLE", + "name": "Physical_Sciences_and_Engineering_pats", + "type": "INTEGER", + "description": "AI patent families in physical science and engineering application category." + }, + { + "mode": "NULLABLE", + "name": "Life_Sciences_pats", + "type": "INTEGER", + "description": "AI patent families in life sciences application category." + }, + { + "mode": "NULLABLE", + "name": "Security__eg_cybersecurity_pats", + "type": "INTEGER", + "description": "AI patent families in security (e.g. cybersecurity) application category." + }, + { + "mode": "NULLABLE", + "name": "Transportation_pats", + "type": "INTEGER", + "description": "AI patent families in transportation application category." + }, + { + "mode": "NULLABLE", + "name": "Industrial_and_Manufacturing_pats", + "type": "INTEGER", + "description": "AI patent families in industrial and manufacturing application category." + }, + { + "mode": "NULLABLE", + "name": "Education_pats", + "type": "INTEGER", + "description": "AI patent families in education application category." 
+ }, + { + "mode": "NULLABLE", + "name": "Document_Mgt_and_Publishing_pats", + "type": "INTEGER", + "description": "AI patent families in document management and publishing application category." + }, + { + "mode": "NULLABLE", + "name": "Military_pats", + "type": "INTEGER", + "description": "AI patent families in military application category." + }, + { + "mode": "NULLABLE", + "name": "Agricultural_pats", + "type": "INTEGER", + "description": "AI patent families in agricultural application category." + }, + { + "mode": "NULLABLE", + "name": "Computing_in_Government_pats", + "type": "INTEGER", + "description": "AI patent families in computing in government application category." + }, + { + "mode": "NULLABLE", + "name": "Personal_Devices_and_Computing_pats", + "type": "INTEGER", + "description": "AI patent families in personal devices and computing application category." + }, + { + "mode": "NULLABLE", + "name": "Banking_and_Finance_pats", + "type": "INTEGER", + "description": "AI patent families in banking and finance application category." + }, + { + "mode": "NULLABLE", + "name": "Telecommunications_pats", + "type": "INTEGER", + "description": "AI patent families in telecommunications application category." + }, + { + "mode": "NULLABLE", + "name": "Networks__eg_social_IOT_etc_pats", + "type": "INTEGER", + "description": "AI patent families in networks (e.g. social, IOT, etc.) application category." + }, + { + "mode": "NULLABLE", + "name": "Business_pats", + "type": "INTEGER", + "description": "AI patent families in business application category." + }, + { + "mode": "NULLABLE", + "name": "Energy_Management_pats", + "type": "INTEGER", + "description": "AI patent families in energy management application category." + }, + { + "mode": "NULLABLE", + "name": "Entertainment_pats", + "type": "INTEGER", + "description": "AI patent families in entertainment application category." 
+ }, + { + "mode": "NULLABLE", + "name": "Nanotechnology_pats", + "type": "INTEGER", + "description": "AI patent families in nanotechnology application category." + }, + { + "mode": "NULLABLE", + "name": "Semiconductors_pats", + "type": "INTEGER", + "description": "AI patent families in semiconductors application category." + }, + { + "mode": "NULLABLE", + "name": "Language_Processing_pats", + "type": "INTEGER", + "description": "AI patent families in language processing functional application category." + }, + { + "mode": "NULLABLE", + "name": "Speech_Processing_pats", + "type": "INTEGER", + "description": "AI patent families in speech processing functional application category." + }, + { + "mode": "NULLABLE", + "name": "Knowledge_Representation_pats", + "type": "INTEGER", + "description": "AI patent families in knowledge representation functional application category." + }, + { + "mode": "NULLABLE", + "name": "Planning_and_Scheduling_pats", + "type": "INTEGER", + "description": "AI patent families in planning and scheduling functional application category." + }, + { + "mode": "NULLABLE", + "name": "Control_pats", + "type": "INTEGER", + "description": "AI patent families in control functional application category." + }, + { + "mode": "NULLABLE", + "name": "Distributed_AI_pats", + "type": "INTEGER", + "description": "AI patent families in distributed AI functional application category." + }, + { + "mode": "NULLABLE", + "name": "Robotics_pats", + "type": "INTEGER", + "description": "AI patent families in robotics functional application category." + }, + { + "mode": "NULLABLE", + "name": "Computer_Vision_pats", + "type": "INTEGER", + "description": "AI patent families in computer vision functional application category." + }, + { + "mode": "NULLABLE", + "name": "Analytics_and_Algorithms_pats", + "type": "INTEGER", + "description": "AI patent families in analytics and algorithms functional application category." 
+ }, + { + "mode": "NULLABLE", + "name": "Measuring_and_Testing_pats", + "type": "INTEGER", + "description": "AI patent families in measuring and testing functional application category." + }, + { + "mode": "NULLABLE", + "name": "Logic_Programming_pats", + "type": "INTEGER", + "description": "AI patent families in logic programming AI techniques category." + }, + { + "mode": "NULLABLE", + "name": "Fuzzy_Logic_pats", + "type": "INTEGER", + "description": "AI patent families in fuzzy logic AI techniques category." + }, + { + "mode": "NULLABLE", + "name": "Probabilistic_Reasoning_pats", + "type": "INTEGER", + "description": "AI patent families in probabilistic reasoning AI techniques category." + }, + { + "mode": "NULLABLE", + "name": "Ontology_Engineering_pats", + "type": "INTEGER", + "description": "AI patent families in ontology engineering AI techniques category." + }, + { + "mode": "NULLABLE", + "name": "Machine_Learning_pats", + "type": "INTEGER", + "description": "AI patent families in machine learning AI techniques category." + }, + { + "mode": "NULLABLE", + "name": "Search_Methods_pats", + "type": "INTEGER", + "description": "AI patent families in search methods AI techniques category." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "ai_patents", + "type": "INTEGER", + "description": "AI patent families count for that year." + } + ], + "mode": "REPEATED", + "name": "ai_patents_by_year", + "type": "RECORD", + "description": "Count of total AI patent families by priority year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." 
+ }, + { + "mode": "NULLABLE", + "name": "Physical_Sciences_and_Engineering_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the physical science and engineering application category for that year." + } + ], + "mode": "REPEATED", + "name": "Physical_Sciences_and_Engineering_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the physical science and engineering application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Life_Sciences_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the life sciences application category for that year." + } + ], + "mode": "REPEATED", + "name": "Life_Sciences_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the life sciences application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Security__eg_cybersecurity_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the security (e.g. cybersecurity) application category for that year." + } + ], + "mode": "REPEATED", + "name": "Security__eg_cybersecurity_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the security (e.g. cybersecurity) application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Transportation_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the transportation application category for that year." 
+ } + ], + "mode": "REPEATED", + "name": "Transportation_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the transportation application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Industrial_and_Manufacturing_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the industrial and manufacturing application category for that year." + } + ], + "mode": "REPEATED", + "name": "Industrial_and_Manufacturing_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the industrial and manufacturing application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Education_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the education application category for that year." + } + ], + "mode": "REPEATED", + "name": "Education_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the education application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Document_Mgt_and_Publishing_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the document management and publishing application category for that year." + } + ], + "mode": "REPEATED", + "name": "Document_Mgt_and_Publishing_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the document management and publishing application category by year." 
+ }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Military_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the military application category for that year." + } + ], + "mode": "REPEATED", + "name": "Military_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the military application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Agricultural_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the agricultural application category for that year." + } + ], + "mode": "REPEATED", + "name": "Agricultural_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the agricultural application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Computing_in_Government_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the computing in government application category for that year." + } + ], + "mode": "REPEATED", + "name": "Computing_in_Government_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the computing in government application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Personal_Devices_and_Computing_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the personal devices and computing application category for that year." 
+ } + ], + "mode": "REPEATED", + "name": "Personal_Devices_and_Computing_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the personal devices and computing application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Banking_and_Finance_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the banking and finance application category for that year." + } + ], + "mode": "REPEATED", + "name": "Banking_and_Finance_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the banking and finance application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Telecommunications_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the telecommunications application category for that year." + } + ], + "mode": "REPEATED", + "name": "Telecommunications_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the telecommunications application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Networks__eg_social_IOT_etc_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the networks (e.g. social, IOT, etc.) application category for that year." + } + ], + "mode": "REPEATED", + "name": "Networks__eg_social_IOT_etc_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the networks (e.g. social, IOT, etc.) application category by year." 
+ }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Business_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the business application category for that year." + } + ], + "mode": "REPEATED", + "name": "Business_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the business application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Energy_Management_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the energy management application category for that year." + } + ], + "mode": "REPEATED", + "name": "Energy_Management_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the energy management application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Entertainment_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the entertainment application category for that year." + } + ], + "mode": "REPEATED", + "name": "Entertainment_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the entertainment application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Nanotechnology_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the nanotechnology application category for that year." 
+ } + ], + "mode": "REPEATED", + "name": "Nanotechnology_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the nanotechnology application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Semiconductors_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the semiconductors application category for that year." + } + ], + "mode": "REPEATED", + "name": "Semiconductors_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the semiconductors application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Language_Processing_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the language processing functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Language_Processing_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the language processing functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Speech_Processing_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the speech processing functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Speech_Processing_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the speech processing functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." 
+ }, + { + "mode": "NULLABLE", + "name": "Knowledge_Representation_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the knowledge representation functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Knowledge_Representation_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the knowledge representation functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Planning_and_Scheduling_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the planning and scheduling functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Planning_and_Scheduling_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the planning and scheduling functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Control_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the control functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Control_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the control functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Distributed_AI_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the distributed AI functional application category for that year." 
+ } + ], + "mode": "REPEATED", + "name": "Distributed_AI_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the distributed AI functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Robotics_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the robotics functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Robotics_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the robotics functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Computer_Vision_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the computer vision functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Computer_Vision_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the computer vision functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Analytics_and_Algorithms_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the analytics and algorithms functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Analytics_and_Algorithms_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the analytics and algorithms functional application category by year."
+ }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Measuring_and_Testing_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the measuring and testing functional application category for that year." + } + ], + "mode": "REPEATED", + "name": "Measuring_and_Testing_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the measuring and testing functional application category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Logic_Programming_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the logic programming AI techniques category for that year." + } + ], + "mode": "REPEATED", + "name": "Logic_Programming_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the logic programming AI techniques category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Fuzzy_Logic_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the fuzzy logic AI techniques category for that year." + } + ], + "mode": "REPEATED", + "name": "Fuzzy_Logic_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the fuzzy logic AI techniques category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Probabilistic_Reasoning_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the probabilistic reasoning AI techniques category for that year." 
+ } + ], + "mode": "REPEATED", + "name": "Probabilistic_Reasoning_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the probabilistic reasoning AI techniques category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Ontology_Engineering_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the ontology engineering AI techniques category for that year." + } + ], + "mode": "REPEATED", + "name": "Ontology_Engineering_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the ontology engineering AI techniques category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Machine_Learning_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the machine learning AI techniques category for that year." + } + ], + "mode": "REPEATED", + "name": "Machine_Learning_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the machine learning AI techniques category by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "priority_year", + "type": "INTEGER", + "description": "Priority year of AI patent family." + }, + { + "mode": "NULLABLE", + "name": "Search_Methods_pats", + "type": "INTEGER", + "description": "Count of AI patent families in the search methods AI techniques category for that year." + } + ], + "mode": "REPEATED", + "name": "Search_Methods_pats_by_year", + "type": "RECORD", + "description": "Count of AI patents in the search methods AI techniques category by year." 
+ } +] \ No newline at end of file diff --git a/company_linkage/schemas/visualization_data.json b/company_linkage/schemas/visualization_data.json new file mode 100644 index 00000000..b34fc089 --- /dev/null +++ b/company_linkage/schemas/visualization_data.json @@ -0,0 +1,376 @@ +[ + { + "mode": "NULLABLE", + "name": "CSET_id", + "type": "INTEGER", + "description": "CSET id of PARAT company." + }, + { + "mode": "NULLABLE", + "name": "name", + "type": "STRING", + "description": "Name of PARAT company." + }, + { + "mode": "NULLABLE", + "name": "country", + "type": "STRING", + "description": "Country of PARAT company. If company is located in multiple countries, country of headquarters." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "alias_language", + "type": "STRING", + "description": "Language alias is written in." + }, + { + "mode": "NULLABLE", + "name": "alias", + "type": "STRING", + "description": "Alias of company." + } + ], + "mode": "REPEATED", + "name": "aliases", + "type": "RECORD", + "description": "List of company aliases." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "parent_acquisition", + "type": "BOOLEAN", + "description": "Boolean indicating whether the company was acquired by its parent company." + }, + { + "mode": "NULLABLE", + "name": "parent_name", + "type": "STRING", + "description": "Name of parent company." + }, + { + "mode": "NULLABLE", + "name": "parent_id", + "type": "INTEGER", + "description": "CSET id of parent company." + } + ], + "mode": "REPEATED", + "name": "parent", + "type": "RECORD", + "description": "List of parent companies." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "child_name", + "type": "STRING", + "description": "Name of child company." + }, + { + "mode": "NULLABLE", + "name": "child_id", + "type": "INTEGER", + "description": "CSET id of child company." + } + ], + "mode": "REPEATED", + "name": "children", + "type": "RECORD", + "description": "List of child companies." 
+ }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "child_name", + "type": "STRING", + "description": "Name of child company." + }, + { + "mode": "NULLABLE", + "name": "child_id", + "type": "INTEGER", + "description": "CSET id of child company." + } + ], + "mode": "REPEATED", + "name": "non_agg_children", + "type": "RECORD", + "description": "List of child companies whose data has not been aggregated into the records of the parent company." + }, + { + "mode": "REPEATED", + "name": "permid", + "type": "INTEGER", + "description": "Refinitiv Permid." + }, + { + "mode": "NULLABLE", + "name": "website", + "type": "STRING", + "description": "Company website." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "exchange", + "type": "STRING", + "description": "Exchange on which the company is listed." + }, + { + "mode": "NULLABLE", + "name": "ticker", + "type": "STRING", + "description": "Company ticker." + } + ], + "mode": "REPEATED", + "name": "market", + "type": "RECORD", + "description": "Company exchange and ticker data." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "crunchbase_uuid", + "type": "STRING", + "description": "UUID in Crunchbase." + }, + { + "mode": "NULLABLE", + "name": "crunchbase_url", + "type": "STRING", + "description": "URL on Crunchbase website." + } + ], + "mode": "NULLABLE", + "name": "crunchbase", + "type": "RECORD", + "description": "Crunchbase unique identifier." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "crunchbase_uuid", + "type": "STRING", + "description": "UUID in Crunchbase." + }, + { + "mode": "NULLABLE", + "name": "crunchbase_url", + "type": "STRING", + "description": "URL on Crunchbase website." + } + ], + "mode": "REPEATED", + "name": "child_crunchbase", + "type": "RECORD", + "description": "Crunchbase unique identifiers for any child companies of the aggregated company." + }, + { + "mode": "REPEATED", + "name": "ror_id", + "type": "STRING", + "description": "ROR id for the company."
+ }, + { + "mode": "REPEATED", + "name": "linkedin", + "type": "STRING", + "description": "LinkedIn website for the company." + }, + { + "mode": "NULLABLE", + "name": "in_sandp_500", + "type": "BOOLEAN", + "description": "Indicator of whether the company was in the S&P 500 at some point during 2020." + }, + { + "mode": "NULLABLE", + "name": "in_fortune_global_500", + "type": "BOOLEAN", + "description": "Indicator of whether the company was on the 2021 Fortune Global 500 list." + }, + { + "mode": "NULLABLE", + "name": "ai_pubs", + "type": "INTEGER", + "description": "Count of total AI publications by the company." + }, + { + "mode": "NULLABLE", + "name": "cv_pubs", + "type": "INTEGER", + "description": "Count of total computer vision publications by the company." + }, + { + "mode": "NULLABLE", + "name": "nlp_pubs", + "type": "INTEGER", + "description": "Count of total natural language processing publications by the company." + }, + { + "mode": "NULLABLE", + "name": "robotics_pubs", + "type": "INTEGER", + "description": "Count of total robotics publications by the company." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year published." + }, + { + "mode": "NULLABLE", + "name": "ai_pubs", + "type": "INTEGER", + "description": "Count of total AI publications by the company in that year." + } + ], + "mode": "REPEATED", + "name": "ai_pubs_by_year", + "type": "RECORD", + "description": "Counts of AI publications by the company by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year published." + }, + { + "mode": "NULLABLE", + "name": "cv_pubs", + "type": "INTEGER", + "description": "Count of total computer vision publications by the company in that year." + } + ], + "mode": "REPEATED", + "name": "cv_pubs_by_year", + "type": "RECORD", + "description": "Counts of computer vision publications by the company by year." 
+ }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year published." + }, + { + "mode": "NULLABLE", + "name": "nlp_pubs", + "type": "INTEGER", + "description": "Count of total natural language processing publications by the company in that year." + } + ], + "mode": "REPEATED", + "name": "nlp_pubs_by_year", + "type": "RECORD", + "description": "Counts of natural language processing publications by the company by year." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year published." + }, + { + "mode": "NULLABLE", + "name": "robotics_pubs", + "type": "INTEGER", + "description": "Count of total robotics publications by the company in that year." + } + ], + "mode": "REPEATED", + "name": "robotics_pubs_by_year", + "type": "RECORD", + "description": "Counts of robotics publications by the company by year." + }, + { + "mode": "NULLABLE", + "name": "ai_pubs_in_top_conferences", + "type": "INTEGER", + "description": "Counts of total AI publications by the company that were published in top AI conferences." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year published." + }, + { + "mode": "NULLABLE", + "name": "ai_pubs_in_top_conferences", + "type": "INTEGER", + "description": "Count of total AI publications by the company that were published in top AI conferences in that year." + } + ], + "mode": "REPEATED", + "name": "ai_pubs_in_top_conferences_by_year", + "type": "RECORD", + "description": "Counts of AI publications in top conferences by the company by year." + }, + { + "mode": "NULLABLE", + "name": "all_pubs", + "type": "INTEGER", + "description": "Count of total publications by the company." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Year published." 
+ }, + { + "mode": "NULLABLE", + "name": "all_pubs", + "type": "INTEGER", + "description": "Counts of total publications by the company in that year." + } + ], + "mode": "REPEATED", + "name": "all_pubs_by_year", + "type": "RECORD", + "description": "Counts of publications by the company by year." + }, + { + "mode": "NULLABLE", + "name": "short_description", + "type": "STRING", + "description": "Short description of the company, as drawn from Crunchbase's free interface." + }, + { + "mode": "NULLABLE", + "name": "logo_url", + "type": "STRING", + "description": "URL linking to a picture of the logo of the company, as drawn from Crunchbase's free interface." + }, + { + "mode": "NULLABLE", + "name": "stage", + "type": "STRING", + "description": "Maturity stage of a company." + } +] \ No newline at end of file diff --git a/company_linkage/schemas/workforce_visualization_data.json b/company_linkage/schemas/workforce_visualization_data.json new file mode 100644 index 00000000..f90a8022 --- /dev/null +++ b/company_linkage/schemas/workforce_visualization_data.json @@ -0,0 +1,20 @@ +[ + { + "mode": "NULLABLE", + "name": "cset_id", + "type": "INTEGER", + "description": "CSET id of PARAT company." + }, + { + "mode": "NULLABLE", + "name": "tt1_jobs", + "type": "INTEGER", + "description": "AI jobs as defined under CSET's technical track 1 definition." + }, + { + "mode": "NULLABLE", + "name": "ai_jobs", + "type": "INTEGER", + "description": "AI jobs as defined under a narrower definition within CSET's technical track 1 definition, focused specifically on research and implementation jobs within technical track 1." 
+ } +] \ No newline at end of file diff --git a/company_linkage/sequences.txt b/company_linkage/sequences.txt deleted file mode 100644 index e69de29b..00000000 diff --git a/company_linkage/sequences/initial_data.csv b/company_linkage/sequences/initial_data.csv new file mode 100644 index 00000000..c1a10432 --- /dev/null +++ b/company_linkage/sequences/initial_data.csv @@ -0,0 +1,6 @@ +high_resolution_entities,organizations +staging_ai_companies_visualization,ai_publications +staging_ai_companies_visualization,linked_ai_patents +staging_ai_companies_visualization,top_conference_pubs +staging_ai_companies_visualization,pubs_in_top_conferences +staging_ai_companies_visualization,all_publications \ No newline at end of file diff --git a/company_linkage/sequences/visualization_data.csv b/company_linkage/sequences/visualization_data.csv new file mode 100644 index 00000000..09ae1630 --- /dev/null +++ b/company_linkage/sequences/visualization_data.csv @@ -0,0 +1,19 @@ +staging_ai_companies_visualization,initial_visualization_data +staging_ai_companies_visualization,visualization_data_with_by_year +staging_ai_companies_visualization,visualization_data_with_top_papers +staging_ai_companies_visualization,visualization_data_with_all_papers +staging_ai_companies_visualization,initial_patent_visualization_data +staging_ai_companies_visualization,patent_visualization_data_with_by_year +staging_ai_companies_visualization,initial_paper_visualization_data +staging_ai_companies_visualization,paper_visualization_data_with_mag +staging_ai_companies_visualization,paper_visualization_data_with_clusters +staging_ai_companies_visualization,paper_visualization_data_with_company_references +staging_ai_companies_visualization,paper_visualization_data_with_tasks +staging_ai_companies_visualization,paper_visualization_data_with_methods +staging_ai_companies_visualization,initial_workforce_visualization_data +staging_ai_companies_visualization,workforce_visualization_data_with_ai_jobs 
+staging_ai_companies_visualization,visualization_data_omit_by_rule +staging_ai_companies_visualization,visualization_data +staging_ai_companies_visualization,patent_visualization_data +staging_ai_companies_visualization,paper_visualization_data +staging_ai_companies_visualization,workforce_visualization_data \ No newline at end of file diff --git a/company_linkage/sql/adding_paper_patent_data.sql b/company_linkage/sql/adding_paper_patent_data.sql deleted file mode 100644 index b075c7c7..00000000 --- a/company_linkage/sql/adding_paper_patent_data.sql +++ /dev/null @@ -1,34 +0,0 @@ --- DEPRECATED, REMOVE SOON --- Update the visualization table itself to add paper and patent data -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS - -- Pull in the paper and patent counts, along with the CSET ids to link them in -WITH - count_data AS ( - SELECT - CSET_id, - ai_pubs, - ai_pubs_by_year, - ai_patents, - ai_patents_by_year - FROM - `gcp-cset-projects.ai_companies_visualization.paper_patent_counts`), - -- Pull in the current visualization data. 
Exclude the ai_pubs data, since that was included when we built the paper/patent data, so we don't need it - viz_data AS ( - SELECT - * EXCEPT(ai_pubs, ai_pubs_by_year) - FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data`) - -- Join the two together using the CSET id -SELECT - viz_data.*, - ai_pubs, - ai_pubs_by_year, - ai_patents, - ai_patents_by_year -FROM - viz_data -LEFT JOIN - count_data -ON - viz_data.CSET_id = count_data.CSET_id \ No newline at end of file diff --git a/company_linkage/sql/ai_publications.sql b/company_linkage/sql/ai_publications.sql new file mode 100644 index 00000000..4c87164a --- /dev/null +++ b/company_linkage/sql/ai_publications.sql @@ -0,0 +1,59 @@ +WITH + ai_papers AS ( + SELECT + merged_id, + cv_filtered, + nlp_filtered, + robotics_filtered + FROM + gcp-cset-projects.article_classification.predictions + WHERE + ai_filtered = TRUE OR cv_filtered = TRUE OR nlp_filtered = TRUE OR robotics_filtered = TRUE), + ror AS ( + -- Adding in org names and country data using ROR + SELECT + id, + ror.name AS org_name, + standard_name AS country + FROM + gcp_cset_ror.ror + LEFT JOIN + countries.country_code + ON lower(country.country_code) = lower(country_code.raw_alpha_2)), + merged_rors AS ( + -- Selecting all the merged ids and ror ids from the literature table + SELECT + DISTINCT + merged_id, + ror_id, + org_name, + cv_filtered as cv, + nlp_filtered as nlp, + robotics_filtered as robotics + FROM + literature.affiliations + -- if they're AI papers + INNER JOIN ai_papers + USING (merged_id)), + article_years AS ( + SELECT + merged_id, + year + FROM + literature.papers) +SELECT + -- Adding in the org name and country associated with the ror id + merged_rors.* EXCEPT (org_name), + COALESCE(ror.org_name, merged_rors.org_name) as org_name, + country, + year +FROM + merged_rors +LEFT JOIN + ror +ON + merged_rors.ror_id = ror.id +LEFT JOIN + article_years +ON + merged_rors.merged_id = article_years.merged_id \ No newline at end of 
file diff --git a/company_linkage/sql/selecting_all_publications.sql b/company_linkage/sql/all_publications.sql similarity index 93% rename from company_linkage/sql/selecting_all_publications.sql rename to company_linkage/sql/all_publications.sql index 6631575f..e11ef86f 100644 --- a/company_linkage/sql/selecting_all_publications.sql +++ b/company_linkage/sql/all_publications.sql @@ -1,6 +1,4 @@ - -- Pulling every publication id linked to every author affiliate and all years because we'll want those later for yearly counts -CREATE OR REPLACE TABLE - ai_companies_visualization.all_publications AS + -- Pulling every publication id linked to every author affiliate and all years because we'll want those later for yearly count WITH ror AS ( -- Adding in org names and country data using ROR diff --git a/company_linkage/sql/check_all_paper_counts_greater.sql b/company_linkage/sql/check_all_paper_counts_greater.sql new file mode 100644 index 00000000..f3e20b8b --- /dev/null +++ b/company_linkage/sql/check_all_paper_counts_greater.sql @@ -0,0 +1,5 @@ +SELECT + LOGICAL_AND(all_pubs >= ai_pubs) + AND LOGICAL_AND(all_pubs >= ai_pubs_in_top_conferences) +FROM + staging_ai_companies_visualization.visualization_data_with_all_papers \ No newline at end of file diff --git a/company_linkage/sql/check_paper_counts_exist.sql b/company_linkage/sql/check_paper_counts_exist.sql new file mode 100644 index 00000000..4171ab87 --- /dev/null +++ b/company_linkage/sql/check_paper_counts_exist.sql @@ -0,0 +1,11 @@ +SELECT + COUNT(*) = 0 +FROM + staging_ai_companies_visualization.visualization_data_with_all_papers +WHERE + ai_pubs IS NULL + OR robotics_pubs IS NULL + OR cv_pubs IS NULL + OR nlp_pubs IS NULL + OR ai_pubs_in_top_conferences IS NULL + OR all_pubs IS NULL \ No newline at end of file diff --git a/company_linkage/sql/check_paper_have_all_ids_pre_omit.sql b/company_linkage/sql/check_paper_have_all_ids_pre_omit.sql new file mode 100644 index 00000000..3b33804a --- /dev/null +++ 
b/company_linkage/sql/check_paper_have_all_ids_pre_omit.sql @@ -0,0 +1,11 @@ +-- Check that the paper visualization data table has all the CSET organization ids in the table before +-- we run omit by rule +SELECT + COUNT(DISTINCT paper_visualization_data_with_methods.CSET_id) = COUNT(DISTINCT aggregated_organizations.CSET_id) + AND LOGICAL_AND(paper_visualization_data_with_methods.CSET_id IS NOT NULL) +FROM + staging_ai_companies_visualization.paper_visualization_data_with_methods +FULL OUTER JOIN + high_resolution_entities.aggregated_organizations +USING + (CSET_id) \ No newline at end of file diff --git a/company_linkage/sql/check_patent_counts_exist.sql b/company_linkage/sql/check_patent_counts_exist.sql new file mode 100644 index 00000000..e90821f6 --- /dev/null +++ b/company_linkage/sql/check_patent_counts_exist.sql @@ -0,0 +1,40 @@ +SELECT + COUNT(*) = 0 +FROM + staging_ai_companies_visualization.patent_visualization_data_with_by_year +WHERE + ai_patents IS NULL + OR Physical_Sciences_and_Engineering_pats IS NULL + OR Life_Sciences_pats IS NULL + OR Security__eg_cybersecurity_pats IS NULL + OR Transportation_pats IS NULL + OR Education_pats IS NULL + OR Document_Mgt_and_Publishing_pats IS NULL + OR Military_pats IS NULL + OR Agricultural_pats IS NULL + OR Computing_in_Government_pats IS NULL + OR Personal_Devices_and_Computing_pats IS NULL + OR Banking_and_Finance_pats IS NULL + OR Telecommunications_pats IS NULL + OR Networks__eg_social_IOT_etc_pats IS NULL + OR Business_pats IS NULL + OR Energy_Management_pats IS NULL + OR Entertainment_pats IS NULL + OR Nanotechnology_pats IS NULL + OR Semiconductors_pats IS NULL + OR Language_Processing_pats IS NULL + OR Speech_Processing_pats IS NULL + OR Knowledge_Representation_pats IS NULL + OR Planning_and_Scheduling_pats IS NULL + OR Control_pats IS NULL + OR Distributed_AI_pats IS NULL + OR Robotics_pats IS NULL + OR Computer_Vision_pats IS NULL + OR Analytics_and_Algorithms_pats IS NULL + OR 
Measuring_and_Testing_pats IS NULL + OR Logic_Programming_pats IS NULL + OR Fuzzy_Logic_pats IS NULL + OR Probabilistic_Reasoning_pats IS NULL + OR Ontology_Engineering_pats IS NULL + OR Machine_Learning_pats IS NULL + OR Search_Methods_pats IS NULL \ No newline at end of file diff --git a/company_linkage/sql/check_patent_have_all_ids_pre_omit.sql b/company_linkage/sql/check_patent_have_all_ids_pre_omit.sql new file mode 100644 index 00000000..3aa42d29 --- /dev/null +++ b/company_linkage/sql/check_patent_have_all_ids_pre_omit.sql @@ -0,0 +1,11 @@ +-- Check that the patent visualization data table has all the CSET organization ids in the table before +-- we run omit by rule +SELECT + COUNT(DISTINCT patent_visualization_data_with_by_year.CSET_id) = COUNT(DISTINCT aggregated_organizations.CSET_id) + AND LOGICAL_AND(patent_visualization_data_with_by_year.CSET_id IS NOT NULL) +FROM + staging_ai_companies_visualization.patent_visualization_data_with_by_year +FULL OUTER JOIN + high_resolution_entities.aggregated_organizations +USING + (CSET_id) \ No newline at end of file diff --git a/company_linkage/sql/check_visualization_have_all_ids_pre_omit.sql b/company_linkage/sql/check_visualization_have_all_ids_pre_omit.sql new file mode 100644 index 00000000..cd5532fb --- /dev/null +++ b/company_linkage/sql/check_visualization_have_all_ids_pre_omit.sql @@ -0,0 +1,11 @@ +-- Check that the visualization data table has all the CSET organization ids in the table before +-- we run omit by rule +SELECT + COUNT(DISTINCT visualization_data_with_all_papers.CSET_id) = COUNT(DISTINCT aggregated_organizations.CSET_id) + AND LOGICAL_AND(visualization_data_with_all_papers.CSET_id IS NOT NULL) +FROM + staging_ai_companies_visualization.visualization_data_with_all_papers +FULL OUTER JOIN + high_resolution_entities.aggregated_organizations +USING + (CSET_id) \ No newline at end of file diff --git a/company_linkage/sql/check_workforce_have_all_ids_pre_omit.sql 
b/company_linkage/sql/check_workforce_have_all_ids_pre_omit.sql new file mode 100644 index 00000000..fb9aaeb2 --- /dev/null +++ b/company_linkage/sql/check_workforce_have_all_ids_pre_omit.sql @@ -0,0 +1,11 @@ +-- Check that the workforce visualization data table has all the CSET organization ids in the table before +-- we run omit by rule +SELECT + COUNT(DISTINCT workforce_visualization_data_with_ai_jobs.CSET_id) = COUNT(DISTINCT aggregated_organizations.CSET_id) + AND LOGICAL_AND(workforce_visualization_data_with_ai_jobs.CSET_id IS NOT NULL) +FROM + staging_ai_companies_visualization.workforce_visualization_data_with_ai_jobs +FULL OUTER JOIN + high_resolution_entities.aggregated_organizations +USING + (CSET_id) \ No newline at end of file diff --git a/company_linkage/sql/creating_paper_visualization_data.sql b/company_linkage/sql/initial_paper_visualization_data.sql similarity index 61% rename from company_linkage/sql/creating_paper_visualization_data.sql rename to company_linkage/sql/initial_paper_visualization_data.sql index 5aacbb79..09c085b2 100644 --- a/company_linkage/sql/creating_paper_visualization_data.sql +++ b/company_linkage/sql/initial_paper_visualization_data.sql @@ -1,17 +1,15 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.paper_visualization_data AS WITH get_citations AS ( SELECT DISTINCT CSET_id, - refs_merged.merged_id, + references.merged_id, ref_id FROM - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers LEFT JOIN - `gcp-cset-projects.gcp_cset_links_v2.paper_references_merged` refs_merged + literature.references ON - (ai_company_pubs.merged_id = ref_id)), + (ai_company_papers.merged_id = ref_id)), add_year AS ( SELECT DISTINCT CSET_id, @@ -21,7 +19,7 @@ WITH FROM get_citations LEFT JOIN - gcp_cset_links_v2.corpus_merged + literature.papers USING (merged_id) WHERE @@ -35,8 +33,9 @@ WITH add_year GROUP BY CSET_id, - year) -SELECT + year), +all_cited as +(SELECT CSET_id, 
ARRAY_AGG(STRUCT(year, citation_count) @@ -45,6 +44,15 @@ SELECT FROM by_year GROUP BY - CSET_id + CSET_id) +SELECT + CSET_id, + citation_count_by_year +FROM + high_resolution_entities.aggregated_organizations +LEFT JOIN + all_cited +USING + (CSET_id) ORDER BY CSET_id \ No newline at end of file diff --git a/company_linkage/sql/creating_patent_visualization_data.sql b/company_linkage/sql/initial_patent_visualization_data.sql similarity index 96% rename from company_linkage/sql/creating_patent_visualization_data.sql rename to company_linkage/sql/initial_patent_visualization_data.sql index 134c75b8..f5ef5a68 100644 --- a/company_linkage/sql/creating_patent_visualization_data.sql +++ b/company_linkage/sql/initial_patent_visualization_data.sql @@ -1,12 +1,10 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.patent_visualization_data AS WITH aipats AS ( -- Pulling all the patents from any of our companies SELECT * FROM - ai_companies_visualization.ai_company_patents), + staging_ai_companies_visualization.ai_company_patents), pattable AS ( -- Getting the count of patents SELECT @@ -92,7 +90,7 @@ SELECT COALESCE(Machine_Learning_pats, 0) as Machine_Learning_pats, COALESCE(Search_Methods_pats, 0) as Search_Methods_pats, FROM - `gcp-cset-projects.high_resolution_entities.aggregated_organizations` AS orgs + high_resolution_entities.aggregated_organizations LEFT JOIN pattable USING diff --git a/company_linkage/sql/creating_initial_visualization_data_publications.sql b/company_linkage/sql/initial_visualization_data.sql similarity index 79% rename from company_linkage/sql/creating_initial_visualization_data_publications.sql rename to company_linkage/sql/initial_visualization_data.sql index 64097b5d..c7eb2642 100644 --- a/company_linkage/sql/creating_initial_visualization_data_publications.sql +++ b/company_linkage/sql/initial_visualization_data.sql @@ -1,12 +1,8 @@ -- This query pulls the initial visualization data for the table that doesn't have to be compiled (as it's 
already -- available in the organizations table) and adds in the AI publication counts. - - -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS WITH aipubs AS ( - -- Pulling all the papers with any of the given GRIDs as affiliates + -- Pulling all the papers with any of the given RORs as affiliates SELECT CSET_id, merged_id, @@ -14,8 +10,8 @@ WITH nlp, robotics FROM - ai_companies_visualization.ai_company_pubs), - gridtable AS ( + staging_ai_companies_visualization.ai_company_papers), + rortable AS ( -- Getting the count of publications SELECT CSET_id, @@ -41,7 +37,7 @@ SELECT market, crunchbase, child_crunchbase, - grid, + ror_id, linkedin, in_sandp_500, in_fortune_global_500, @@ -50,8 +46,8 @@ SELECT COALESCE(nlp_pubs, 0) as nlp_pubs, COALESCE(robotics_pubs, 0) as robotics_pubs FROM - `gcp-cset-projects.high_resolution_entities.aggregated_organizations` AS orgs + high_resolution_entities.aggregated_organizations LEFT JOIN - gridtable + rortable USING (CSET_id) \ No newline at end of file diff --git a/company_linkage/sql/creating_workforce_visualization_data.sql b/company_linkage/sql/initial_workforce_visualization_data.sql similarity index 61% rename from company_linkage/sql/creating_workforce_visualization_data.sql rename to company_linkage/sql/initial_workforce_visualization_data.sql index 70620d57..f6cdbec3 100644 --- a/company_linkage/sql/creating_workforce_visualization_data.sql +++ b/company_linkage/sql/initial_workforce_visualization_data.sql @@ -1,26 +1,25 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.workforce_visualization_data AS WITH clean_linkedins AS ( SELECT DISTINCT cset_id, name, - REPLACE(linkedins, "https://www.", "http://") AS linkedin + REPLACE(REPLACE(linkedins, "https://www.", ""), "http://www.", "") AS linkedin FROM high_resolution_entities.aggregated_organizations CROSS JOIN - UNNEST (linkedin) AS linkedins) -SELECT + UNNEST (linkedin) AS linkedins), +job_info as +(SELECT DISTINCT cset_id, COUNT(DISTINCT 
user_id) AS tt1_jobs FROM clean_linkedins LEFT JOIN - `gcp-cset-projects.gcp_cset_revelio.position` position + revelio.individual_position ON - linkedin = company_li_url + linkedin = company_linkedin_url INNER JOIN - gcp_cset_revelio.role_lookup + revelio.role_lookup USING (mapped_role) INNER JOIN @@ -28,12 +27,12 @@ INNER JOIN ON (k1000 = role_k1000) LEFT JOIN - gcp_cset_revelio.education + revelio.individual_education USING (user_id) WHERE - (position.enddate IS NULL - OR position.enddate > CURRENT_DATE ()) + (individual_position.enddate IS NULL + OR individual_position.enddate > CURRENT_DATE ()) AND (ba_req IS FALSE OR ((degree = "Bachelor" OR degree = "Master" @@ -43,6 +42,16 @@ WHERE OR ((degree = "Doctor") AND REGEXP_CONTAINS(field_raw, r'(?i)(computer\s+science|computer\s+engineering|electrical\s+engineering)'))) GROUP BY - cset_id + cset_id) +SELECT + DISTINCT + cset_id, + COALESCE(tt1_jobs, 0) as tt1_jobs +FROM + high_resolution_entities.aggregated_organizations +LEFT JOIN + job_info +USING + (cset_id) ORDER BY cset_id \ No newline at end of file diff --git a/company_linkage/sql/selecting_ai_patents.sql b/company_linkage/sql/linked_ai_patents.sql similarity index 92% rename from company_linkage/sql/selecting_ai_patents.sql rename to company_linkage/sql/linked_ai_patents.sql index e5db1fff..ee7360b2 100644 --- a/company_linkage/sql/selecting_ai_patents.sql +++ b/company_linkage/sql/linked_ai_patents.sql @@ -1,16 +1,15 @@ -- Pulling every AI-associated patent family id linked to every grid id of any assignee for that patent, and all the assignee names -- We also pull in the AI subcategories and the years -- We also attempt to add in "fake" families for the patents that are missing patent families -create or replace table ai_companies_visualization.linked_ai_patents as with patents_orig as ( SELECT - -- Pulling in the current assignee grid ids from dimensions + -- Pulling in the current assignee ror ids from dimensions patent_id, family_id, assignee, - grid + 
ror_id FROM - `gcp-cset-projects.unified_patents.normalized_patent_assignees`), + unified_patents.assignees_normalized), all_ai as ( -- Selecting all the family ids and patent IDs to get AI patents -- Also select the year so we can get counts by year @@ -51,13 +50,13 @@ all_ai as ( Machine_Learning, Search_Methods FROM - gcp-cset-projects.unified_patents.ai_patents), + unified_patents.ai_patents), patent_years as ( SELECT patent_id, EXTRACT(year FROM first_priority_date) as priority_year FROM - gcp-cset-projects.unified_patents.patent_dates + unified_patents.dates ) SELECT DISTINCT @@ -66,7 +65,7 @@ all_ai as ( -- We're just doing this so our counts aren't blank COALESCE(family_id, "X-" || patent_id) as family_id, assignee, - grid, + ror_id, MIN(priority_year) as priority_year, LOGICAL_OR(Physical_Sciences_and_Engineering) as Physical_Sciences_and_Engineering, LOGICAL_OR(Life_Sciences) as Life_Sciences, @@ -111,6 +110,6 @@ all_ai as ( USING (patent_id)) WHERE priority_year IS NOT NULL GROUP BY - grid, + ror_id, assignee, family_id \ No newline at end of file diff --git a/company_linkage/sql/merged_ai_papers.sql b/company_linkage/sql/merged_ai_papers.sql deleted file mode 100644 index 9f3478f2..00000000 --- a/company_linkage/sql/merged_ai_papers.sql +++ /dev/null @@ -1,13 +0,0 @@ -CREATE OR REPLACE TABLE - `gcp-cset-projects.ai_companies_visualization.ai_company_pubs` AS -SELECT - DISTINCT * -FROM - `gcp-cset-projects.ai_companies_visualization.ai_company_pubs` -UNION DISTINCT -SELECT - DISTINCT * -FROM - `gcp-cset-projects.ai_companies_visualization.ai_company_pubs_no_grid` -ORDER BY - id \ No newline at end of file diff --git a/company_linkage/sql/omit_by_rule_papers.sql b/company_linkage/sql/omit_by_rule_papers.sql deleted file mode 100644 index 00e03c81..00000000 --- a/company_linkage/sql/omit_by_rule_papers.sql +++ /dev/null @@ -1,23 +0,0 @@ -CREATE OR REPLACE TABLE - `gcp-cset-projects.ai_companies_visualization.paper_visualization_data` AS - -- Selecting the 
companies we want to leave out -WITH - to_omit AS ( - SELECT - CSET_id - FROM - ai_companies_visualization.visualization_data - RIGHT JOIN - ai_companies_visualization.paper_visualization_data - USING (cset_id) - WHERE visualization_data.cset_id IS NULL) -SELECT - * -FROM - `gcp-cset-projects.ai_companies_visualization.paper_visualization_data` -WHERE - CSET_id NOT IN ( - SELECT - * - FROM - to_omit) \ No newline at end of file diff --git a/company_linkage/sql/omit_by_rule_patents.sql b/company_linkage/sql/omit_by_rule_patents.sql deleted file mode 100644 index 8781112e..00000000 --- a/company_linkage/sql/omit_by_rule_patents.sql +++ /dev/null @@ -1,23 +0,0 @@ -CREATE OR REPLACE TABLE - `gcp-cset-projects.ai_companies_visualization.patent_visualization_data` AS - -- Selecting the companies we want to leave out -WITH - to_omit AS ( - SELECT - CSET_id - FROM - ai_companies_visualization.visualization_data - RIGHT JOIN - ai_companies_visualization.patent_visualization_data - USING (cset_id) - WHERE visualization_data.cset_id IS NULL) -SELECT - * -FROM - `gcp-cset-projects.ai_companies_visualization.patent_visualization_data` -WHERE - CSET_id NOT IN ( - SELECT - * - FROM - to_omit) \ No newline at end of file diff --git a/company_linkage/sql/omit_by_rule_workforce.sql b/company_linkage/sql/omit_by_rule_workforce.sql deleted file mode 100644 index 40a25b3c..00000000 --- a/company_linkage/sql/omit_by_rule_workforce.sql +++ /dev/null @@ -1,23 +0,0 @@ -CREATE OR REPLACE TABLE - `gcp-cset-projects.ai_companies_visualization.workforce_visualization_data` AS - -- Selecting the companies we want to leave out -WITH - to_omit AS ( - SELECT - CSET_id - FROM - ai_companies_visualization.visualization_data - RIGHT JOIN - ai_companies_visualization.workforce_visualization_data - USING (cset_id) - WHERE visualization_data.cset_id IS NULL) -SELECT - * -FROM - `gcp-cset-projects.ai_companies_visualization.workforce_visualization_data` -WHERE - CSET_id NOT IN ( - SELECT - * - FROM - 
to_omit) \ No newline at end of file diff --git a/company_linkage/sql/omitting_companies.sql b/company_linkage/sql/omitting_companies.sql deleted file mode 100644 index 1d5d3cef..00000000 --- a/company_linkage/sql/omitting_companies.sql +++ /dev/null @@ -1,15 +0,0 @@ --- DEPRECATED, REMOVE WHEN READY --- We want to omit companies from the visualization -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS -SELECT - * -FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data` - -- Omitting companies based on list -WHERE - CSET_id NOT IN ( - SELECT - * - FROM - ai_companies_visualization.omit) \ No newline at end of file diff --git a/company_linkage/sql/create_organizations_from_airtable_imports.sql b/company_linkage/sql/organizations.sql similarity index 87% rename from company_linkage/sql/create_organizations_from_airtable_imports.sql rename to company_linkage/sql/organizations.sql index 0fa42f1d..b93bf8b1 100644 --- a/company_linkage/sql/create_organizations_from_airtable_imports.sql +++ b/company_linkage/sql/organizations.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE TABLE - high_resolution_entities.organizations AS SELECT * REPLACE( ( SELECT @@ -37,7 +35,7 @@ FROM ( organizations_joined.name, STRUCT(city, province_state, - country) AS location, + organizations_joined.country) AS location, website, ARRAY_AGG(STRUCT(alias_language, alias)) AS aliases, @@ -54,9 +52,9 @@ FROM ( ticker)) AS market, STRUCT(crunchbase_uuid, crunchbase_url) AS crunchbase, - ARRAY_AGG(DISTINCT grid IGNORE NULLS) AS grid, + ARRAY_AGG(DISTINCT ror.id IGNORE NULLS) AS ror_id, regex, - ARRAY_AGG(DISTINCT bgov IGNORE NULLS) AS BGOV_id, + ARRAY_AGG(DISTINCT bgov_id IGNORE NULLS) AS BGOV_id, linkedin, CASE WHEN in_sandp_500 IS TRUE THEN TRUE @@ -70,7 +68,7 @@ FROM ( FALSE END AS in_fortune_global_500, - comment + ids_joined.comment FROM parat_input.organizations_joined LEFT JOIN @@ -105,12 +103,16 @@ FROM ( parat_input.linkedin_joined USING (CSET_id) + LEFT JOIN + 
gcp_cset_ror.ror + ON + grid_joined.grid = external_ids.GRID.all GROUP BY CSET_id, name, city, province_state, - country, + organizations_joined.country, website, crunchbase_uuid, crunchbase_url, diff --git a/company_linkage/sql/paper_visualization_data.sql b/company_linkage/sql/paper_visualization_data.sql new file mode 100644 index 00000000..d71afe2e --- /dev/null +++ b/company_linkage/sql/paper_visualization_data.sql @@ -0,0 +1,26 @@ + -- Selecting the companies we want to leave out + -- Essentially, visualization_data_omit_by_rule contains all the companies that we want + -- to retain after the omit_by_rule process has been applied + -- So, here, in to_omit, we select any company that isn't found in that table as a + -- company we'd like to omit, replicating the rule-based omission. + -- This allows us to omit the same set of companies across all of our tables. +WITH + to_omit AS ( + SELECT + CSET_id + FROM + staging_ai_companies_visualization.visualization_data_omit_by_rule + RIGHT JOIN + staging_ai_companies_visualization.paper_visualization_data_with_methods + USING (cset_id) + WHERE visualization_data_omit_by_rule.cset_id IS NULL) +SELECT + * +FROM + staging_ai_companies_visualization.paper_visualization_data_with_methods +WHERE + CSET_id NOT IN ( + SELECT + * + FROM + to_omit) \ No newline at end of file diff --git a/company_linkage/sql/adding_top_science_map_clusters.sql b/company_linkage/sql/paper_visualization_data_with_clusters.sql similarity index 69% rename from company_linkage/sql/adding_top_science_map_clusters.sql rename to company_linkage/sql/paper_visualization_data_with_clusters.sql index 320490ea..637843b2 100644 --- a/company_linkage/sql/adding_top_science_map_clusters.sql +++ b/company_linkage/sql/paper_visualization_data_with_clusters.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.paper_visualization_data AS WITH company_cluster_assignment AS ( SELECT @@ -7,9 +5,9 @@ WITH merged_id, cluster_id FROM - 
ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers LEFT JOIN - `gcp-cset-projects.science_map_v2.dc5_cluster_assignment_stable` + map_of_science.cluster_assignment USING (merged_id) WHERE @@ -36,10 +34,10 @@ WITH GROUP BY CSET_id) SELECT - paper_visualization_data.*, + paper_visualization_data_with_mag.*, clusters FROM - ai_companies_visualization.paper_visualization_data + staging_ai_companies_visualization.paper_visualization_data_with_mag LEFT JOIN aggregated_clusters USING diff --git a/company_linkage/sql/adding_company_references.sql b/company_linkage/sql/paper_visualization_data_with_company_references.sql similarity index 74% rename from company_linkage/sql/adding_company_references.sql rename to company_linkage/sql/paper_visualization_data_with_company_references.sql index 2b49b118..f935f350 100644 --- a/company_linkage/sql/adding_company_references.sql +++ b/company_linkage/sql/paper_visualization_data_with_company_references.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.paper_visualization_data AS -- First get all the articles cited by the AI papers written by our companies WITH get_references AS ( @@ -8,9 +6,9 @@ WITH merged_id, ref_id FROM - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers LEFT JOIN - `gcp-cset-projects.gcp_cset_links_v2.paper_references_merged` + literature.references USING (merged_id)), referenced_companies AS ( @@ -18,13 +16,13 @@ WITH DISTINCT get_references.CSET_id, get_references.merged_id, ref_id, - ai_company_pubs.CSET_id AS ref_CSET_id + ai_company_papers.CSET_id AS ref_CSET_id FROM get_references INNER JOIN - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers ON - ref_id = ai_company_pubs.merged_id + ref_id = ai_company_papers.merged_id ORDER BY CSET_id), count_company_refs AS ( @@ -54,10 +52,10 @@ GROUP BY ORDER BY CSET_id) SELECT - 
paper_visualization_data.*, + paper_visualization_data_with_clusters.*, company_references FROM - ai_companies_visualization.paper_visualization_data + staging_ai_companies_visualization.paper_visualization_data_with_clusters LEFT JOIN aggregated_refs USING diff --git a/company_linkage/sql/adding_top_mag_ai_fields.sql b/company_linkage/sql/paper_visualization_data_with_mag.sql similarity index 82% rename from company_linkage/sql/adding_top_mag_ai_fields.sql rename to company_linkage/sql/paper_visualization_data_with_mag.sql index 7223d747..46e93f76 100644 --- a/company_linkage/sql/adding_top_mag_ai_fields.sql +++ b/company_linkage/sql/paper_visualization_data_with_mag.sql @@ -1,19 +1,17 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.paper_visualization_data AS WITH names AS ( SELECT field_id AS child_field_id, name FROM - `gcp-cset-projects.fields_of_study.field_meta`), + fields_of_study.field_meta), ai_subfields AS ( SELECT field_id, child_field_id, name AS child_name FROM - `gcp-cset-projects.fields_of_study.field_children` + fields_of_study.field_children LEFT JOIN names USING @@ -43,7 +41,7 @@ WITH field.id AS field_id, field.name AS field_name FROM - `gcp-cset-projects.fields_of_study.top_fields` + fields_of_study.top_fields CROSS JOIN UNNEST(fields) AS field INNER JOIN @@ -59,7 +57,7 @@ WITH field_id, field_name FROM - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers LEFT JOIN articles_with_ai_subfields USING @@ -88,10 +86,10 @@ WITH GROUP BY CSET_id) SELECT - paper_visualization_data.*, + initial_paper_visualization_data.*, fields FROM - ai_companies_visualization.paper_visualization_data + staging_ai_companies_visualization.initial_paper_visualization_data LEFT JOIN aggregated_fields USING diff --git a/company_linkage/sql/adding_top_methods.sql b/company_linkage/sql/paper_visualization_data_with_methods.sql similarity index 74% rename from company_linkage/sql/adding_top_methods.sql rename to 
company_linkage/sql/paper_visualization_data_with_methods.sql index 853177cd..e561d138 100644 --- a/company_linkage/sql/adding_top_methods.sql +++ b/company_linkage/sql/paper_visualization_data_with_methods.sql @@ -1,12 +1,10 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.paper_visualization_data AS WITH articles_with_ai_methods AS ( SELECT DISTINCT merged_id, referent, FROM - `gcp-cset-projects.tasks_and_methods.method_referents` + tasks_and_methods.method_referents CROSS JOIN UNNEST(referents) AS referent), company_articles_with_methods AS ( @@ -15,7 +13,7 @@ WITH merged_id, referent FROM - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers LEFT JOIN articles_with_ai_methods USING @@ -44,10 +42,10 @@ WITH GROUP BY CSET_id) SELECT - paper_visualization_data.*, + paper_visualization_data_with_tasks.*, methods FROM - ai_companies_visualization.paper_visualization_data + staging_ai_companies_visualization.paper_visualization_data_with_tasks LEFT JOIN aggregated_fields USING diff --git a/company_linkage/sql/adding_top_tasks.sql b/company_linkage/sql/paper_visualization_data_with_tasks.sql similarity index 74% rename from company_linkage/sql/adding_top_tasks.sql rename to company_linkage/sql/paper_visualization_data_with_tasks.sql index afaf256c..eed2f588 100644 --- a/company_linkage/sql/adding_top_tasks.sql +++ b/company_linkage/sql/paper_visualization_data_with_tasks.sql @@ -1,12 +1,10 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.paper_visualization_data AS WITH articles_with_ai_tasks AS ( SELECT DISTINCT merged_id, referent, FROM - `gcp-cset-projects.tasks_and_methods.task_referents` + tasks_and_methods.task_referents CROSS JOIN UNNEST(referents) AS referent), company_articles_with_tasks AS ( @@ -15,7 +13,7 @@ WITH merged_id, referent FROM - ai_companies_visualization.ai_company_pubs + staging_ai_companies_visualization.ai_company_papers LEFT JOIN articles_with_ai_tasks USING @@ -44,10 +42,10 @@ 
WITH GROUP BY CSET_id) SELECT - paper_visualization_data.*, + paper_visualization_data_with_company_references.*, tasks FROM - ai_companies_visualization.paper_visualization_data + staging_ai_companies_visualization.paper_visualization_data_with_company_references LEFT JOIN aggregated_fields USING diff --git a/company_linkage/sql/patent_visualization_data.sql b/company_linkage/sql/patent_visualization_data.sql new file mode 100644 index 00000000..8047506a --- /dev/null +++ b/company_linkage/sql/patent_visualization_data.sql @@ -0,0 +1,26 @@ + -- Selecting the companies we want to leave out + -- Essentially, visualization_data_omit_by_rule contains all the companies that we want + -- to retain after the omit_by_rule process has been applied + -- So, here, in to_omit, we select any company that isn't found in that table as a + -- company we'd like to omit, replicating the rule-based omission. + -- This allows us to omit the same set of companies across all of our tables. +WITH + to_omit AS ( + SELECT + CSET_id + FROM + staging_ai_companies_visualization.visualization_data_omit_by_rule + RIGHT JOIN + staging_ai_companies_visualization.patent_visualization_data_with_by_year + USING (cset_id) + WHERE visualization_data_omit_by_rule.cset_id IS NULL) +SELECT + * +FROM + staging_ai_companies_visualization.patent_visualization_data_with_by_year +WHERE + CSET_id NOT IN ( + SELECT + * + FROM + to_omit) \ No newline at end of file diff --git a/company_linkage/sql/adding_ai_patents_by_year_to_visualization.sql b/company_linkage/sql/patent_visualization_data_with_by_year.sql similarity index 96% rename from company_linkage/sql/adding_ai_patents_by_year_to_visualization.sql rename to company_linkage/sql/patent_visualization_data_with_by_year.sql index 5804c8f9..06ed1457 100644 --- a/company_linkage/sql/adding_ai_patents_by_year_to_visualization.sql +++ b/company_linkage/sql/patent_visualization_data_with_by_year.sql @@ -1,12 +1,10 @@ -CREATE OR REPLACE TABLE - 
ai_companies_visualization.patent_visualization_data AS WITH aipats AS ( -- Pulling all the patents from any of our companies SELECT * FROM - ai_companies_visualization.ai_company_patents), + staging_ai_companies_visualization.ai_company_patents), pattable AS ( -- Getting the count of patents SELECT @@ -202,7 +200,7 @@ WITH priority_year) AS Search_Methods_pats_by_year, FROM - `gcp-cset-projects.high_resolution_entities.aggregated_organizations` AS orgs + high_resolution_entities.aggregated_organizations LEFT JOIN pattable USING @@ -215,7 +213,7 @@ SELECT viz.*, by_year.* EXCEPT (CSET_id) FROM - `gcp-cset-projects.ai_companies_visualization.patent_visualization_data` AS viz + staging_ai_companies_visualization.initial_patent_visualization_data AS viz LEFT JOIN by_year USING diff --git a/company_linkage/sql/pulling_publications_in_top_ai_conferences.sql b/company_linkage/sql/pubs_in_top_conferences.sql similarity index 88% rename from company_linkage/sql/pulling_publications_in_top_ai_conferences.sql rename to company_linkage/sql/pubs_in_top_conferences.sql index 08487408..3b23ee19 100644 --- a/company_linkage/sql/pulling_publications_in_top_ai_conferences.sql +++ b/company_linkage/sql/pubs_in_top_conferences.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE TABLE - ai_companies_visualization.pubs_in_top_conferences AS WITH -- Associating GRIDs to the merged paper ids affils AS ( @@ -29,7 +27,7 @@ SELECT ror_id, year FROM - ai_companies_visualization.top_conference_pubs AS top_pubs + staging_ai_companies_visualization.top_conference_pubs AS top_pubs -- We're inner joining because if there's no affiliate information at all we have no way to even evaluate this data for our purposes INNER JOIN affils diff --git a/company_linkage/sql/selecting_ai_publications.sql b/company_linkage/sql/selecting_ai_publications.sql deleted file mode 100644 index 6877d104..00000000 --- a/company_linkage/sql/selecting_ai_publications.sql +++ /dev/null @@ -1,61 +0,0 @@ - -- Pulling every AI-associated 
publication id linked to every grid id and every organization name - -- We also include years because we'll want those later for yearly counts - -- and cv/robotics/nlp so we can filter on these -CREATE OR REPLACE TABLE - ai_companies_visualization.ai_publications AS -WITH - ai_papers AS ( - SELECT - cset_id AS merged_id, - cv_filtered, - nlp_filtered, - robotics_filtered - FROM - gcp-cset-projects.article_classification.predictions - WHERE - ai_filtered = TRUE OR cv_filtered = TRUE OR nlp_filtered = TRUE OR robotics_filtered = TRUE), - gr AS ( - -- Adding in org names and country data using GRID - SELECT - id, - name AS org_name, - country_name AS country - FROM - gcp-cset-projects.gcp_cset_grid.api_grid), - merged_grids AS ( - -- Selecting all the merged ids and grid ids from the links table - SELECT - DISTINCT - merged_id, - grid_id, - org_name, - cv_filtered as cv, - nlp_filtered as nlp, - robotics_filtered as robotics - FROM - `gcp-cset-projects.gcp_cset_links_v2.paper_affiliations_merged` - -- if they're AI papers - INNER JOIN ai_papers - USING (merged_id)), - article_years AS ( - SELECT - merged_id, - year - FROM - `gcp-cset-projects.gcp_cset_links_v2.corpus_merged`) -SELECT - -- Adding in the org name and country associated with the grid id - merged_grids.* EXCEPT (org_name), - COALESCE(gr.org_name, merged_grids.org_name) as org_name, - country, - year -FROM - merged_grids -LEFT JOIN - gr -ON - merged_grids.Grid_ID = gr.id -LEFT JOIN - article_years -ON - merged_grids.merged_id = article_years.merged_id \ No newline at end of file diff --git a/company_linkage/sql/selecting_top_conference_pubs.sql b/company_linkage/sql/top_conference_pubs.sql similarity index 97% rename from company_linkage/sql/selecting_top_conference_pubs.sql rename to company_linkage/sql/top_conference_pubs.sql index af563615..25993325 100644 --- a/company_linkage/sql/selecting_top_conference_pubs.sql +++ b/company_linkage/sql/top_conference_pubs.sql @@ -1,5 +1,3 @@ -CREATE OR REPLACE 
TABLE - ai_companies_visualization.top_conference_pubs AS WITH venues AS ( SELECT diff --git a/company_linkage/sql/adding_crunchbase_company_metadata.sql b/company_linkage/sql/visualization_data.sql similarity index 88% rename from company_linkage/sql/adding_crunchbase_company_metadata.sql rename to company_linkage/sql/visualization_data.sql index 1f48d965..ad7022b7 100644 --- a/company_linkage/sql/adding_crunchbase_company_metadata.sql +++ b/company_linkage/sql/visualization_data.sql @@ -1,14 +1,12 @@ -- We're adding useful Crunchbase data to the visualization: descriptions, logos, and the company's "stage" -- (which we're using as a proxy for its size/growth but is actually based on what funding it has received). -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS WITH -- Pull in all the visualization data, most importantly including the crunchbase uuid that will be used to connect to everything else visualization AS ( SELECT * FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data`), + staging_ai_companies_visualization.visualization_data_omit_by_rule), -- Grab the descriptions and logos from Crunchbase ODM odm_data AS ( SELECT @@ -16,7 +14,7 @@ WITH short_description, logo_url FROM - `gcp-cset-projects.gcp_cset_crunchbase.organizations_odm`), + gcp_cset_crunchbase.organizations_odm), -- Grab the raw stage data for companies -- Since companies have multiple funding rounds they may have multiple rows! -- We need to deal with this @@ -51,7 +49,7 @@ WITH END AS stage FROM - `gcp-cset-projects.gcp_cset_crunchbase.funding_rounds`), + gcp_cset_crunchbase.funding_rounds), -- Now we want only one stage value to come out for any given company -- If a company has ever been mature, it's no longer growth or startup, etc. 
-- So there's a clear hierarchy, and we take the max @@ -78,9 +76,9 @@ WITH FROM combine_stages LEFT JOIN - gcp_cset_crunchbase.organizations orgs + gcp_cset_crunchbase.organizations ON - combine_stages.org_uuid = orgs.uuid ), + combine_stages.org_uuid = organizations.uuid ), stage_name AS ( SELECT org_uuid, @@ -111,7 +109,7 @@ FROM LEFT JOIN odm_data ON - visualization.crunchbase.crunchbase_uuid = odm_data.uuid + TRIM(visualization.crunchbase.crunchbase_uuid) = TRIM(odm_data.uuid) LEFT JOIN stage_name ON diff --git a/company_linkage/sql/omit_by_rule.sql b/company_linkage/sql/visualization_data_omit_by_rule.sql similarity index 73% rename from company_linkage/sql/omit_by_rule.sql rename to company_linkage/sql/visualization_data_omit_by_rule.sql index 597e871c..bef7cb15 100644 --- a/company_linkage/sql/omit_by_rule.sql +++ b/company_linkage/sql/visualization_data_omit_by_rule.sql @@ -1,17 +1,15 @@ -CREATE OR REPLACE TABLE - `gcp-cset-projects.ai_companies_visualization.visualization_data` AS -- Selecting the companies we want to leave out WITH to_omit AS ( SELECT CSET_id FROM - ai_companies_visualization.visualization_data + staging_ai_companies_visualization.visualization_data_with_all_papers LEFT JOIN - ai_companies_visualization.patent_visualization_data + staging_ai_companies_visualization.patent_visualization_data_with_by_year USING (cset_id) LEFT JOIN - ai_companies_visualization.workforce_visualization_data + staging_ai_companies_visualization.workforce_visualization_data_with_ai_jobs USING (cset_id) WHERE @@ -32,7 +30,7 @@ WITH SELECT * FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data` + staging_ai_companies_visualization.visualization_data_with_all_papers WHERE CSET_id NOT IN ( SELECT diff --git a/company_linkage/sql/adding_all_paper_counts.sql b/company_linkage/sql/visualization_data_with_all_papers.sql similarity index 51% rename from company_linkage/sql/adding_all_paper_counts.sql rename to 
company_linkage/sql/visualization_data_with_all_papers.sql index 073a6dce..67901809 100644 --- a/company_linkage/sql/adding_all_paper_counts.sql +++ b/company_linkage/sql/visualization_data_with_all_papers.sql @@ -1,6 +1,4 @@ -- Update the visualization table itself to add total paper data -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS -- Pull in the total paper counts, along with the CSET ids to link them in WITH count_data AS ( @@ -9,14 +7,13 @@ WITH all_pubs, all_pubs_by_year, FROM - `gcp-cset-projects.ai_companies_visualization.total_paper_counts`), - -- Pull in the current visualization data. Exclude the all_paper data, since that was included when we built the all paper data, so we don't need it + staging_ai_companies_visualization.all_paper_counts), + -- Pull in the current visualization data viz_data AS ( SELECT - * EXCEPT(all_pubs, - all_pubs_by_year) + * FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data`) + staging_ai_companies_visualization.visualization_data_with_top_papers) -- Join the two together using the CSET id SELECT viz_data.*, diff --git a/company_linkage/sql/adding_ai_pubs_by_year_to_visualization.sql b/company_linkage/sql/visualization_data_with_by_year.sql similarity index 80% rename from company_linkage/sql/adding_ai_pubs_by_year_to_visualization.sql rename to company_linkage/sql/visualization_data_with_by_year.sql index 51db60ec..d229b93f 100644 --- a/company_linkage/sql/adding_ai_pubs_by_year_to_visualization.sql +++ b/company_linkage/sql/visualization_data_with_by_year.sql @@ -1,7 +1,5 @@ -- Adding AI publication data by year to the visualization table -- This uses the same mechanism as adding AI publication counts; we're just doing it on a by-year basis -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS WITH aipubs AS ( -- Pulling all the papers with any of the given GRIDs as affiliates @@ -13,8 +11,8 @@ WITH nlp, robotics FROM - 
ai_companies_visualization.ai_company_pubs), - gridtable AS ( + staging_ai_companies_visualization.ai_company_papers), + rortable AS ( -- Getting the count of publications SELECT CSET_id, @@ -49,23 +47,23 @@ WITH ORDER BY year) AS robotics_pubs_by_year, FROM - `gcp-cset-projects.high_resolution_entities.aggregated_organizations` AS orgs + high_resolution_entities.aggregated_organizations LEFT JOIN - gridtable + rortable USING (CSET_id) GROUP BY CSET_id) SELECT - viz.*, + initial_visualization_data.*, ai_pubs_by_year, cv_pubs_by_year, nlp_pubs_by_year, robotics_pubs_by_year FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data` AS viz + staging_ai_companies_visualization.initial_visualization_data LEFT JOIN by_year ON - viz.CSET_id = by_year.CSET_id + initial_visualization_data.CSET_id = by_year.CSET_id ORDER BY cset_id \ No newline at end of file diff --git a/company_linkage/sql/adding_top_paper_counts.sql b/company_linkage/sql/visualization_data_with_top_papers.sql similarity index 57% rename from company_linkage/sql/adding_top_paper_counts.sql rename to company_linkage/sql/visualization_data_with_top_papers.sql index cf5e0d47..7277bd51 100644 --- a/company_linkage/sql/adding_top_paper_counts.sql +++ b/company_linkage/sql/visualization_data_with_top_papers.sql @@ -1,6 +1,4 @@ -- Update the visualization table itself to add top paper data -CREATE OR REPLACE TABLE - ai_companies_visualization.visualization_data AS -- Pull in the top paper counts, along with the CSET ids to link them in WITH count_data AS ( @@ -9,13 +7,13 @@ WITH ai_pubs_in_top_conferences, ai_pubs_in_top_conferences_by_year, FROM - `gcp-cset-projects.ai_companies_visualization.top_paper_counts`), - -- Pull in the current visualization data. Exclude the ai_pubs_in_top_conferences data, since that was included when we built the top paper data, so we don't need it + staging_ai_companies_visualization.top_paper_counts), + -- Pull in the current visualization data. 
viz_data AS ( SELECT * FROM - `gcp-cset-projects.ai_companies_visualization.visualization_data`) + staging_ai_companies_visualization.visualization_data_with_by_year) -- Join the two together using the CSET id SELECT viz_data.*, diff --git a/company_linkage/sql/workforce_visualization_data.sql b/company_linkage/sql/workforce_visualization_data.sql new file mode 100644 index 00000000..e5cc4f8e --- /dev/null +++ b/company_linkage/sql/workforce_visualization_data.sql @@ -0,0 +1,26 @@ + -- Selecting the companies we want to leave out + -- Essentially, visualization_data_omit_by_rule contains all the companies that we want + -- to retain after the omit_by_rule process has been applied + -- So, here, in to_omit, we select any company that isn't found in that table as a + -- company we'd like to omit, replicating the rule-based omission. + -- This allows us to omit the same set of companies across all of our tables. +WITH + to_omit AS ( + SELECT + CSET_id + FROM + staging_ai_companies_visualization.visualization_data_omit_by_rule + RIGHT JOIN + staging_ai_companies_visualization.workforce_visualization_data_with_ai_jobs + USING (cset_id) + WHERE visualization_data_omit_by_rule.cset_id IS NULL) +SELECT + * +FROM + staging_ai_companies_visualization.workforce_visualization_data_with_ai_jobs +WHERE + CSET_id NOT IN ( + SELECT + * + FROM + to_omit) \ No newline at end of file diff --git a/company_linkage/sql/adding_ai_jobs_to_workforce_visualization.sql b/company_linkage/sql/workforce_visualization_data_with_ai_jobs.sql similarity index 73% rename from company_linkage/sql/adding_ai_jobs_to_workforce_visualization.sql rename to company_linkage/sql/workforce_visualization_data_with_ai_jobs.sql index 3f0c174c..a6981174 100644 --- a/company_linkage/sql/adding_ai_jobs_to_workforce_visualization.sql +++ b/company_linkage/sql/workforce_visualization_data_with_ai_jobs.sql @@ -1,10 +1,9 @@ -create or replace table ai_companies_visualization.workforce_visualization_data as WITH 
clean_linkedins AS ( SELECT DISTINCT cset_id, name, - REPLACE(linkedins, "https://www.", "http://") AS linkedin + REPLACE(REPLACE(linkedins, "https://www.", ""), "http://www.", "") AS linkedin FROM high_resolution_entities.aggregated_organizations CROSS JOIN @@ -12,15 +11,15 @@ WITH new_ai_jobs AS ( SELECT DISTINCT cset_id, - COUNT(DISTINCT user_id) AS ai_jobs + COUNT(DISTINCT individual_position.user_id) AS ai_jobs FROM clean_linkedins INNER JOIN - `gcp-cset-projects.gcp_cset_revelio.position` position + revelio.individual_position ON - linkedin = company_li_url + linkedin = company_linkedin_url INNER JOIN - gcp_cset_revelio.role_lookup + revelio.role_lookup USING (mapped_role) INNER JOIN @@ -28,12 +27,16 @@ WITH ON (k1000 = role_k1000) LEFT JOIN - gcp_cset_revelio.education + revelio.individual_education USING (user_id) + LEFT JOIN + revelio.individual_position_descriptions + USING + (position_id) WHERE - (position.enddate IS NULL - OR position.enddate > CURRENT_DATE()) + (individual_position.enddate IS NULL + OR individual_position.enddate > CURRENT_DATE()) AND (ba_req IS FALSE OR ((degree = "Bachelor" OR degree = "Master" @@ -52,7 +55,7 @@ SELECT tt1_jobs, COALESCE(ai_jobs, 0) as ai_jobs FROM - ai_companies_visualization.workforce_visualization_data + staging_ai_companies_visualization.initial_workforce_visualization_data LEFT JOIN new_ai_jobs USING