georgetown-cset · jmelot · Feb 6, 2024 · Jan 30, 2024
diff --git a/company_linkage/parat_data_dag.py b/company_linkage/parat_data_dag.py
@@ -212,7 +212,7 @@
     )
 
     run_papers = []
-    for paper_type in ["top_paper", "all_paper", "all_patent"]:
+    for paper_type in ["top_paper", "highly_cited_paper", "all_paper", "all_patent"]:
 
         run_get_paper_counts = GKEStartPodOperator(
             task_id=f"run_get_{paper_type}_counts",
@@ -262,6 +262,17 @@
         write_disposition="WRITE_TRUNCATE"
     )
 
+    load_highly_cited_papers = GCSToBigQueryOperator(
+        task_id=f"load_highly_cited_papers",
+        bucket=DATA_BUCKET,
+        source_objects=[f"{tmp_dir}/highly_cited_paper/highly_cited_paper_counts.jsonl"],
+        schema_object=f"{schema_dir}/highly_cited_papers_schema.json",
+        destination_project_dataset_table=f"{staging_dataset}.highly_cited_paper_counts",
+        source_format="NEWLINE_DELIMITED_JSON",
+        create_disposition="CREATE_IF_NEEDED",
+        write_disposition="WRITE_TRUNCATE"
+    )
+
     load_all_papers = GCSToBigQueryOperator(
         task_id=f"load_all_papers",
         bucket=DATA_BUCKET,
@@ -376,6 +387,7 @@
         >> load_ai_patent_grants
         >> run_papers
         >> load_top_papers
+        >> load_highly_cited_papers
         >> load_all_papers
         >> load_all_patents
         >> start_visualization_tables

diff --git a/company_linkage/parat_scripts/highly_cited_papers.py b/company_linkage/parat_scripts/highly_cited_papers.py
@@ -0,0 +1,28 @@
+import argparse
+
+from get_ai_counts import CountGetter
+
+
+def main() -> None:
+    """Query highly cited AI publication counts and write them to the output file."""
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument(
+        "output_file",
+        type=str,
+        help="A jsonl file for writing output data to create new tables",
+    )
+    cli_args = arg_parser.parse_args()
+    # An empty name also lacks "jsonl", so this single test covers both bad cases.
+    if "jsonl" not in cli_args.output_file:
+        arg_parser.print_help()
+        return
+    count_getter = CountGetter()
+    count_getter.get_identifiers()
+    # Same flow as the AI pubs script; only the source table and output variable differ.
+    source_table = "staging_ai_companies_visualization.highly_cited_ai_publications"
+    counts = count_getter.run_query_papers(source_table, "highly_cited_ai_pubs", by_year=True)
+    count_getter.write_output(counts, cli_args.output_file)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/company_linkage/schemas/highly_cited_papers_schema.json b/company_linkage/schemas/highly_cited_papers_schema.json
@@ -0,0 +1,34 @@
+[
+      {
+        "mode": "REQUIRED",
+        "name": "CSET_id",
+        "type": "INTEGER",
+        "description": "The CSET_id for a company."
+      },
+      {
+        "mode": "REQUIRED",
+        "name": "highly_cited_ai_pubs",
+        "type": "INTEGER",
+        "description": "Count of highly cited AI papers."
+      },
+      {
+        "fields": [
+          {
+            "mode": "NULLABLE",
+            "name": "year",
+            "type": "INTEGER",
+            "description": "Publication year of papers."
+          },
+          {
+            "mode": "NULLABLE",
+            "name": "highly_cited_ai_pubs",
+            "type": "INTEGER",
+            "description": "Count of highly cited AI papers published in that year."
+          }
+        ],
+        "mode": "REPEATED",
+        "name": "highly_cited_ai_pubs_by_year",
+        "type": "RECORD",
+        "description": "Highly cited AI papers by year."
+      }
+  ]
diff --git a/company_linkage/sequences/initial_data.csv b/company_linkage/sequences/initial_data.csv
@@ -5,4 +5,5 @@ staging_ai_companies_visualization,linked_ai_patents_grants
 staging_ai_companies_visualization,linked_all_patents
 staging_ai_companies_visualization,top_conference_pubs
 staging_ai_companies_visualization,pubs_in_top_conferences
+staging_ai_companies_visualization,highly_cited_ai_publications
 staging_ai_companies_visualization,all_publications
diff --git a/company_linkage/sequences/visualization_data.csv b/company_linkage/sequences/visualization_data.csv
@@ -1,6 +1,7 @@
 staging_ai_companies_visualization,initial_visualization_data
 staging_ai_companies_visualization,visualization_data_with_by_year
 staging_ai_companies_visualization,visualization_data_with_top_papers
+staging_ai_companies_visualization,visualization_data_with_highly_cited
 staging_ai_companies_visualization,visualization_data_with_all_papers
 staging_ai_companies_visualization,initial_patent_visualization_data
 staging_ai_companies_visualization,patent_visualization_data_with_by_year

diff --git a/company_linkage/sql/highly_cited_ai_publications.sql b/company_linkage/sql/highly_cited_ai_publications.sql
@@ -0,0 +1,60 @@
+-- Keep AI publications at or above the 90th percentile of citation counts within their year.
+WITH
+  ai_pubs AS (
+  SELECT
+    merged_id,
+    ror_id,
+    org_name,
+    country,
+    year
+  FROM
+    staging_ai_companies_visualization.ai_publications
+  ),
+
+  -- Count distinct citing papers per ai_pubs row; rows with no matching ref_id are dropped by the inner join.
+  citation_counts AS (
+  SELECT
+    refs.ref_id AS merged_id,
+    COUNT(DISTINCT refs.merged_id) AS citation_count,
+    ai_pubs.ror_id,
+    ai_pubs.org_name,
+    ai_pubs.country,
+    ai_pubs.year
+  FROM
+    literature.references AS refs
+  INNER JOIN
+    ai_pubs
+  ON
+    refs.ref_id = ai_pubs.merged_id
+  GROUP BY
+    refs.ref_id,
+    ai_pubs.ror_id,
+    ai_pubs.org_name,
+    ai_pubs.country,
+    ai_pubs.year
+  ),
+
+  -- Flag rows at or above the 90th percentile of citation_count among citation_counts rows of the same year.
+  get_top_cited AS (
+  SELECT
+    merged_id,
+    citation_count,
+    ror_id,
+    org_name,
+    country,
+    (citation_count >= PERCENTILE_CONT(citation_count, 0.9) OVER (PARTITION BY year)) AS top_cited,
+    year
+  FROM
+    citation_counts
+  )
+
+SELECT
+  DISTINCT merged_id,
+  ror_id,
+  org_name,
+  country,
+  year
+FROM
+  get_top_cited
+WHERE
+  top_cited IS TRUE
diff --git a/company_linkage/sql/visualization_data_with_all_papers.sql b/company_linkage/sql/visualization_data_with_all_papers.sql
@@ -13,7 +13,7 @@ WITH
   SELECT
     *
   FROM
-    staging_ai_companies_visualization.visualization_data_with_top_papers)
+    staging_ai_companies_visualization.visualization_data_with_highly_cited)
   -- Join the two together using the CSET id
 SELECT
   viz_data.*,

diff --git a/company_linkage/sql/visualization_data_with_highly_cited.sql b/company_linkage/sql/visualization_data_with_highly_cited.sql
@@ -0,0 +1,27 @@
+  -- Extend the visualization table with highly cited paper counts.
+  -- First select the per-company highly cited counts, keyed by CSET_id.
+WITH
+  highly_cited_counts AS (
+  SELECT
+    CSET_id,
+    highly_cited_ai_pubs,
+    highly_cited_ai_pubs_by_year
+  FROM
+    staging_ai_companies_visualization.highly_cited_paper_counts),
+  -- Then take the visualization data as it stands after the top papers step.
+  current_viz AS (
+  SELECT
+    *
+  FROM
+    staging_ai_companies_visualization.visualization_data_with_top_papers)
+  -- Left join on CSET_id so every visualization row is kept; unmatched rows get NULL for the added columns.
+SELECT
+  current_viz.*,
+  highly_cited_counts.highly_cited_ai_pubs,
+  highly_cited_counts.highly_cited_ai_pubs_by_year
+FROM
+  current_viz
+LEFT JOIN
+  highly_cited_counts
+ON
+  current_viz.CSET_id = highly_cited_counts.CSET_id