Merge pull request #196 from georgetown-cset/146-add-patents-granted-metric

Add grants data for patents
jmelot authored Jan 29, 2024
2 parents df731b2 + 0cb39ca commit 8cd2d58
Showing 23 changed files with 12,420 additions and 11,844 deletions.
37 changes: 1 addition & 36 deletions company_linkage/README.md
@@ -14,42 +14,7 @@ script to get top conference papers and total papers, these tests will work for
This code is dependent on internal CSET BigQuery datasets; without access to these datasets, you will not be able to
run some of this code as-is.

## Tasks to build visualization data

1. [organizations.sql](sql/organizations.sql)
2. [ai_publications.sql](sql/ai_publications.sql)
3. [linked_ai_patents.sql](sql/linked_ai_patents.sql)
4. [top_conference_pubs.sql](sql/top_conference_pubs.sql)
5. [pubs_in_top_conferences.sql](sql/pubs_in_top_conferences.sql)
6. [all_publications.sql](sql/all_publications.sql)
7. `python3 aggregate_organizations.py aggregated_organizations.jsonl`
8. Replace `high_resolution_entities.aggregated_organizations` with the data from `aggregated_organizations.jsonl` using the [aggregated_organizations_schema](schemas/aggregated_organizations_schema.json)
9. `python3 get_ai_counts.py data/ai_company_papers.jsonl data/ai_company_patents.jsonl`
10. Upload `ai_company_papers.jsonl` to `ai_companies_visualization.ai_company_pubs` using the [ai_papers_schema](schemas/ai_papers_schema.json)
11. Upload `ai_company_patents.jsonl` to `ai_companies_visualization.ai_company_patents` using the [ai_patents_schema](schemas/ai_patents_schema.json)
12. `python3 top_papers.py top_paper_counts.jsonl`
13. Upload `top_paper_counts.jsonl` to `ai_companies_visualization.top_paper_counts` using the [top_papers_schema](schemas/top_papers_schema.json)
14. `python3 all_papers.py all_paper_counts.jsonl`
15. Upload `all_paper_counts.jsonl` to `ai_companies_visualization.total_paper_counts` using the [all_papers_schema](schemas/all_papers_schema.json)
16. [initial_visualization_data.sql](sql/initial_visualization_data.sql)
17. [visualization_data_with_by_year.sql](sql/visualization_data_with_by_year.sql)
18. [visualization_data_with_top_papers.sql](sql/visualization_data_with_top_papers.sql)
19. [visualization_data_with_all_papers.sql](sql/visualization_data_with_all_papers.sql)
20. [initial_patent_visualization_data.sql](sql/initial_patent_visualization_data.sql)
21. [patent_visualization_data_with_by_year.sql](sql/patent_visualization_data_with_by_year.sql)
22. [initial_paper_visualization_data.sql](sql/initial_paper_visualization_data.sql)
23. [paper_visualization_data_with_mag.sql](sql/paper_visualization_data_with_mag.sql)
24. [paper_visualization_data_with_clusters.sql](sql/paper_visualization_data_with_clusters.sql)
25. [paper_visualization_data_with_company_references.sql](sql/paper_visualization_data_with_company_references.sql)
26. [paper_visualization_data_with_tasks.sql](sql/paper_visualization_data_with_tasks.sql)
27. [paper_visualization_data_with_methods.sql](sql/paper_visualization_data_with_methods.sql)
28. [initial_workforce_visualization_data.sql](sql/initial_workforce_visualization_data.sql)
29. [workforce_visualization_data_with_ai_jobs.sql](sql/workforce_visualization_data_with_ai_jobs.sql)
30. [visualization_data_omit_by_rule.sql](sql/visualization_data_omit_by_rule.sql)
31. [visualization_data.sql](sql/visualization_data.sql)
32. [patent_visualization_data.sql](sql/patent_visualization_data.sql)
33. [paper_visualization_data.sql](sql/paper_visualization_data.sql)
34. [workforce_visualization_data.sql](sql/workforce_visualization_data.sql)
To view the order of tasks necessary to build visualization data, see the Airflow DAG.
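Several of the steps above exchange newline-delimited JSON (`.jsonl`) files between scripts and BigQuery uploads. A minimal sketch of reading and writing that format (generic helpers, not the repo's actual loaders; the sample row is illustrative):

```python
import json
import tempfile

def write_jsonl(rows, path):
    # One JSON object per line, the format the *.jsonl outputs above use
    with open(path, "w") as f:
        for row in rows:
            f.write(json.dumps(row) + "\n")

def read_jsonl(path):
    # Parse each line back into a dict
    with open(path) as f:
        return [json.loads(line) for line in f]

# Round-trip demo with a throwaway file
_path = tempfile.mkstemp(suffix=".jsonl")[1]
write_jsonl([{"CSET_id": 101, "ai_patents": 5}], _path)
recovered = read_jsonl(_path)
```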

# Deployment

15 changes: 14 additions & 1 deletion company_linkage/parat_data_dag.py
@@ -153,7 +153,8 @@
cmds=["/bin/bash"],
arguments=["-c", (f"echo 'getting AI counts!' ; rm -r ai || true ; "
f"mkdir -p ai && "
f"python3 get_ai_counts.py ai/ai_company_papers.jsonl ai/ai_company_patents.jsonl && "
f"python3 get_ai_counts.py ai/ai_company_papers.jsonl ai/ai_company_patents.jsonl "
f"ai/ai_company_patent_grants.jsonl && "
f"gsutil -m cp -r ai gs://{DATA_BUCKET}/{tmp_dir}/ ")],
namespace="default",
image=f"us.gcr.io/{PROJECT_ID}/parat",
@@ -199,6 +200,17 @@
write_disposition="WRITE_TRUNCATE"
)

load_ai_patent_grants = GCSToBigQueryOperator(
    task_id="load_ai_company_patent_grants",
bucket=DATA_BUCKET,
source_objects=[f"{tmp_dir}/ai/ai_company_patent_grants.jsonl"],
schema_object=f"{schema_dir}/ai_patents_schema.json",
destination_project_dataset_table=f"{staging_dataset}.ai_company_patent_grants",
source_format="NEWLINE_DELIMITED_JSON",
create_disposition="CREATE_IF_NEEDED",
write_disposition="WRITE_TRUNCATE"
)

run_papers = []
for paper_type in ["top", "all"]:

@@ -350,6 +362,7 @@
>> run_get_ai_counts
>> load_ai_papers
>> load_ai_patents
>> load_ai_patent_grants
>> run_papers
>> load_top_papers
>> load_all_papers
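The DAG wires the new `load_ai_patent_grants` task into the dependency chain with `>>`. A toy sketch of how that chaining composes (a stand-in `Task` class, not Airflow's operator API):

```python
class Task:
    # Toy stand-in for an Airflow operator: ">>" records a downstream edge
    def __init__(self, task_id):
        self.task_id = task_id
        self.downstream = []

    def __rshift__(self, other):
        self.downstream.append(other)
        return other  # returning `other` lets chains like a >> b >> c work

load_ai_patents = Task("load_ai_company_patents")
load_ai_patent_grants = Task("load_ai_company_patent_grants")
run_papers = Task("run_papers")

# Mirrors the new position of the grants load in the chain above
load_ai_patents >> load_ai_patent_grants >> run_papers
```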
20 changes: 14 additions & 6 deletions company_linkage/parat_scripts/get_ai_counts.py
@@ -191,7 +191,7 @@ def run_query_id_papers(self, table_name: str, test: bool = False) -> list:
"nlp": element["nlp"], "robotics": element["robotics"]})
return company_rows

def run_query_id_patents(self):
def run_query_id_patents(self, table_name: str):
"""
        Get patent counts one by one using CSET_ids.
        :param table_name: Name of the linked AI patents table to query.
        :return:
@@ -240,7 +240,7 @@ def run_query_id_patents(self):
Machine_Learning,
Search_Methods
FROM
staging_ai_companies_visualization.linked_ai_patents
staging_ai_companies_visualization.{table_name}
WHERE regexp_contains(assignee, r'(?i){regexes[0]}') """
# if we have more than one regex for an org, include all of them
if len(regexes) > 1:
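The query assembly above appends one `regexp_contains` clause per assignee regex for the organization. As a minimal pure-Python sketch of that branch (function name hypothetical):

```python
def build_assignee_filter(regexes):
    # First regex starts the WHERE clause; any extras are OR'd in,
    # mirroring the "if we have more than one regex" branch above
    clause = f"WHERE regexp_contains(assignee, r'(?i){regexes[0]}')"
    for regex in regexes[1:]:
        clause += f" OR regexp_contains(assignee, r'(?i){regex}')"
    return clause

# Hypothetical org with two name variants
demo = build_assignee_filter(["deepmind", "google deepmind"])
```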
@@ -274,8 +274,12 @@ def write_output(self, row_list: list, output_file) -> None:

def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("output_papers", type=str, help="A jsonl file for writing output paper data to create new tables")
parser.add_argument("output_patents", type=str, help="A jsonl file for writing output patent data to create new tables")
parser.add_argument("output_papers", type=str,
help="A jsonl file for writing output paper data to create new tables")
parser.add_argument("output_patents", type=str,
help="A jsonl file for writing output patent data to create new tables")
parser.add_argument("output_patent_grants", type=str,
help="A jsonl file for writing output patent grants data to create new tables")
args = parser.parse_args()
if "jsonl" not in args.output_papers or "jsonl" not in args.output_patents:
parser.print_help()
@@ -287,10 +291,14 @@ def main() -> None:
company_rows = count_getter.run_query_id_papers(table_name)
print("Writing results")
count_getter.write_output(company_rows, args.output_papers)
print("Fetching patent data")
patent_companies = count_getter.run_query_id_patents()
print("Fetching patent applications data")
patent_companies = count_getter.run_query_id_patents("linked_ai_patents")
print("Writing results")
count_getter.write_output(patent_companies, args.output_patents)
print("Fetching patent grants data")
patent_grant_companies = count_getter.run_query_id_patents("linked_ai_patents_grants")
print("Writing results")
count_getter.write_output(patent_grant_companies, args.output_patent_grants)

if __name__ == "__main__":
main()
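Note that the `"jsonl" not in ...` extension check in `main()` still only validates the first two outputs. A sketch of a check covering all three (hypothetical helper; it uses a stricter `endswith` test than the original substring check):

```python
def validate_outputs(*paths):
    # True only if every output path looks like a jsonl file,
    # extending the existing check to the new patent-grants output
    return all(p.endswith(".jsonl") for p in paths)
```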
1 change: 1 addition & 0 deletions company_linkage/sequences/initial_data.csv
@@ -1,6 +1,7 @@
high_resolution_entities,organizations
staging_ai_companies_visualization,ai_publications
staging_ai_companies_visualization,linked_ai_patents
staging_ai_companies_visualization,linked_ai_patents_grants
staging_ai_companies_visualization,top_conference_pubs
staging_ai_companies_visualization,pubs_in_top_conferences
staging_ai_companies_visualization,all_publications
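The sequence files are two-column CSVs of (dataset, table) pairs, consumed in order. A small sketch of parsing one (sample rows abridged from the file above):

```python
import csv
import io

# Abridged excerpt of initial_data.csv, including the new grants table
SEQUENCE = """high_resolution_entities,organizations
staging_ai_companies_visualization,linked_ai_patents
staging_ai_companies_visualization,linked_ai_patents_grants
"""

def parse_sequence(text):
    # Each row is (dataset, table); file order defines execution order
    return [tuple(row) for row in csv.reader(io.StringIO(text)) if row]

tables = parse_sequence(SEQUENCE)
```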
2 changes: 2 additions & 0 deletions company_linkage/sequences/visualization_data.csv
@@ -4,6 +4,8 @@ staging_ai_companies_visualization,visualization_data_with_top_papers
staging_ai_companies_visualization,visualization_data_with_all_papers
staging_ai_companies_visualization,initial_patent_visualization_data
staging_ai_companies_visualization,patent_visualization_data_with_by_year
staging_ai_companies_visualization,patent_visualization_data_with_grants
staging_ai_companies_visualization,patent_visualization_data_with_grants_by_year
staging_ai_companies_visualization,initial_paper_visualization_data
staging_ai_companies_visualization,paper_visualization_data_with_mag
staging_ai_companies_visualization,paper_visualization_data_with_clusters
4 changes: 2 additions & 2 deletions company_linkage/sql/linked_ai_patents.sql
@@ -1,9 +1,9 @@
-- Pulling every AI-associated patent family id linked to every grid id of any assignee for that patent, and all the assignee names
-- Pulling every AI-associated patent family id linked to every ror id of any assignee for that patent, and all the assignee names
-- We also pull in the AI subcategories and the years
-- We also attempt to add in "fake" families for the patents that are missing patent families
with patents_orig as (
SELECT
-- Pulling in the current assignee ror ids from dimensions
-- Pulling in the current assignee ror ids
patent_id,
family_id,
assignee,
121 changes: 121 additions & 0 deletions company_linkage/sql/linked_ai_patents_grants.sql
@@ -0,0 +1,121 @@
-- Pulling every AI-associated patent family id linked to every ror id of any assignee for that patent, and all the assignee names
-- We also pull in the AI subcategories and the years
-- We also attempt to add in "fake" families for the patents that are missing patent families
with patents_orig as (
SELECT
  -- Pulling in the current assignee ror ids
patent_id,
assignees_normalized.family_id,
assignee,
ror_id,
granted
FROM
unified_patents.assignees_normalized
LEFT JOIN
unified_patents.metadata
USING
(patent_id)),
all_ai as (
-- Selecting all the family ids and patent IDs to get AI patents
-- Also select the year so we can get counts by year
SELECT
patent_id,
Physical_Sciences_and_Engineering,
Life_Sciences,
Security__eg_cybersecurity,
Transportation,
Industrial_and_Manufacturing,
Education,
Document_Mgt_and_Publishing,
Military,
Agricultural,
Computing_in_Government,
Personal_Devices_and_Computing,
Banking_and_Finance,
Telecommunications,
Networks__eg_social_IOT_etc,
Business,
Energy_Management,
Entertainment,
Nanotechnology,
Semiconductors,
Language_Processing,
Speech_Processing,
Knowledge_Representation,
Planning_and_Scheduling,
  Control,
  Distributed_AI,
Robotics,
Computer_Vision,
Analytics_and_Algorithms,
Measuring_and_Testing,
Logic_Programming,
Fuzzy_Logic,
Probabilistic_Reasoning,
Ontology_Engineering,
Machine_Learning,
Search_Methods
FROM
unified_patents.ai_patents),
patent_years as (
SELECT
patent_id,
EXTRACT(year FROM first_priority_date) as priority_year
FROM
unified_patents.dates
)
SELECT
DISTINCT
-- If the family id is null we can't group by family id so we create a fake family id using the patent id
-- Since we can't group by family id there should only be one patent id in these cases
-- We're just doing this so our counts aren't blank
COALESCE(family_id, "X-" || patent_id) as family_id,
assignee,
ror_id,
MIN(priority_year) as priority_year,
LOGICAL_OR(granted) as granted,
LOGICAL_OR(Physical_Sciences_and_Engineering) as Physical_Sciences_and_Engineering,
LOGICAL_OR(Life_Sciences) as Life_Sciences,
LOGICAL_OR(Security__eg_cybersecurity) as Security__eg_cybersecurity,
LOGICAL_OR(Transportation) as Transportation,
LOGICAL_OR(Industrial_and_Manufacturing) as Industrial_and_Manufacturing,
LOGICAL_OR(Education) as Education,
LOGICAL_OR(Document_Mgt_and_Publishing) as Document_Mgt_and_Publishing,
LOGICAL_OR(Military) as Military,
LOGICAL_OR(Agricultural) as Agricultural,
LOGICAL_OR(Computing_in_Government) as Computing_in_Government,
LOGICAL_OR(Personal_Devices_and_Computing) as Personal_Devices_and_Computing,
LOGICAL_OR(Banking_and_Finance) as Banking_and_Finance,
LOGICAL_OR(Telecommunications) as Telecommunications,
LOGICAL_OR(Networks__eg_social_IOT_etc) as Networks__eg_social_IOT_etc,
LOGICAL_OR(Business) as Business,
LOGICAL_OR(Energy_Management) as Energy_Management,
LOGICAL_OR(Entertainment) as Entertainment,
LOGICAL_OR(Nanotechnology) as Nanotechnology,
LOGICAL_OR(Semiconductors) as Semiconductors,
LOGICAL_OR(Language_Processing) as Language_Processing,
LOGICAL_OR(Speech_Processing) as Speech_Processing,
LOGICAL_OR(Knowledge_Representation) as Knowledge_Representation,
LOGICAL_OR(Planning_and_Scheduling) as Planning_and_Scheduling,
LOGICAL_OR(Control) as Control,
LOGICAL_OR(Distributed_AI) as Distributed_AI,
LOGICAL_OR(Robotics) as Robotics,
LOGICAL_OR(Computer_Vision) as Computer_Vision,
LOGICAL_OR(Analytics_and_Algorithms) as Analytics_and_Algorithms,
LOGICAL_OR(Measuring_and_Testing) as Measuring_and_Testing,
LOGICAL_OR(Logic_Programming) as Logic_Programming,
LOGICAL_OR(Fuzzy_Logic) as Fuzzy_Logic,
LOGICAL_OR(Probabilistic_Reasoning) as Probabilistic_Reasoning,
LOGICAL_OR(Ontology_Engineering) as Ontology_Engineering,
LOGICAL_OR(Machine_Learning) as Machine_Learning,
LOGICAL_OR(Search_Methods) as Search_Methods
-- Only including patents if their ids are in our AI patent set, ensuring we have AI patents
FROM ( all_ai
LEFT JOIN patents_orig
USING (patent_id)
LEFT JOIN patent_years
USING (patent_id))
WHERE priority_year IS NOT NULL and granted is true
GROUP BY
ror_id,
assignee,
family_id
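The query's grouping logic can be mirrored in pure Python to see what the `COALESCE` fake family id, `MIN(priority_year)`, and `LOGICAL_OR` aggregation do (a simplified analogue with a single category flag, not the production query):

```python
from collections import defaultdict

def aggregate_granted_families(rows):
    # Simplified analogue of the SQL above: keep only granted patents with a
    # known priority year, coalesce missing family ids to "X-" + patent_id,
    # group by (ror_id, assignee, family_id), then MIN the year, OR the flags.
    grouped = defaultdict(list)
    for row in rows:
        if not row["granted"] or row["priority_year"] is None:
            continue  # WHERE priority_year IS NOT NULL and granted is true
        family_id = row["family_id"] or "X-" + row["patent_id"]
        grouped[(row["ror_id"], row["assignee"], family_id)].append(row)
    return [
        {
            "family_id": family_id,
            "assignee": assignee,
            "ror_id": ror_id,
            "priority_year": min(r["priority_year"] for r in members),
            "Machine_Learning": any(r["Machine_Learning"] for r in members),
        }
        for (ror_id, assignee, family_id), members in grouped.items()
    ]

# Hypothetical sample: two granted patents in one family, one ungranted orphan
sample = [
    {"patent_id": "p1", "family_id": "f1", "assignee": "A Corp", "ror_id": "r1",
     "priority_year": 2019, "granted": True, "Machine_Learning": False},
    {"patent_id": "p2", "family_id": "f1", "assignee": "A Corp", "ror_id": "r1",
     "priority_year": 2018, "granted": True, "Machine_Learning": True},
    {"patent_id": "p3", "family_id": None, "assignee": "A Corp", "ror_id": "r1",
     "priority_year": 2020, "granted": False, "Machine_Learning": True},
]
families = aggregate_granted_families(sample)
```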
4 changes: 2 additions & 2 deletions company_linkage/sql/patent_visualization_data.sql
@@ -11,13 +11,13 @@ WITH
FROM
staging_ai_companies_visualization.visualization_data_omit_by_rule
RIGHT JOIN
staging_ai_companies_visualization.patent_visualization_data_with_by_year
staging_ai_companies_visualization.patent_visualization_data_with_grants_by_year
USING (cset_id)
WHERE visualization_data_omit_by_rule.cset_id IS NULL)
SELECT
*
FROM
staging_ai_companies_visualization.patent_visualization_data_with_by_year
staging_ai_companies_visualization.patent_visualization_data_with_grants_by_year
WHERE
CSET_id NOT IN (
SELECT
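The surrounding query implements an anti-join: it keeps only companies whose CSET id never appears in the omit-by-rule table. Equivalently, as a set filter (hypothetical helper and sample ids):

```python
def apply_omit_rule(rows, omit_ids):
    # Mirrors: WHERE CSET_id NOT IN (SELECT cset_id FROM visualization_data_omit_by_rule)
    return [row for row in rows if row["CSET_id"] not in omit_ids]

kept = apply_omit_rule(
    [{"CSET_id": 1}, {"CSET_id": 2}, {"CSET_id": 3}],
    omit_ids={2},
)
```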
