diff --git a/company_linkage/parat_data_dag.py b/company_linkage/parat_data_dag.py index a375651d..62cb48b1 100644 --- a/company_linkage/parat_data_dag.py +++ b/company_linkage/parat_data_dag.py @@ -212,7 +212,7 @@ ) run_papers = [] - for paper_type in ["top_paper", "all_paper", "all_patent"]: + for paper_type in ["top_paper", "highly_cited_paper", "all_paper", "all_patent"]: run_get_paper_counts = GKEStartPodOperator( task_id=f"run_get_{paper_type}_counts", @@ -262,6 +262,17 @@ write_disposition="WRITE_TRUNCATE" ) + load_highly_cited_papers = GCSToBigQueryOperator( + task_id=f"load_highly_cited_papers", + bucket=DATA_BUCKET, + source_objects=[f"{tmp_dir}/highly_cited_paper/highly_cited_paper_counts.jsonl"], + schema_object=f"{schema_dir}/highly_cited_papers_schema.json", + destination_project_dataset_table=f"{staging_dataset}.highly_cited_paper_counts", + source_format="NEWLINE_DELIMITED_JSON", + create_disposition="CREATE_IF_NEEDED", + write_disposition="WRITE_TRUNCATE" + ) + load_all_papers = GCSToBigQueryOperator( task_id=f"load_all_papers", bucket=DATA_BUCKET, @@ -376,6 +387,7 @@ >> load_ai_patent_grants >> run_papers >> load_top_papers + >> load_highly_cited_papers >> load_all_papers >> load_all_patents >> start_visualization_tables diff --git a/company_linkage/parat_scripts/highly_cited_papers.py b/company_linkage/parat_scripts/highly_cited_papers.py new file mode 100644 index 00000000..b8b2ca08 --- /dev/null +++ b/company_linkage/parat_scripts/highly_cited_papers.py @@ -0,0 +1,28 @@ +import argparse + +from get_ai_counts import CountGetter + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument("output_file", type=str, + help="A jsonl file for writing output data to create new tables") + args = parser.parse_args() + if not args.output_file: + parser.print_help() + return + if "jsonl" not in args.output_file: + parser.print_help() + return + paper_finder = CountGetter() + paper_finder.get_identifiers() + # These are the only two lines that make this different from running AI pubs + # We select from a different table + table_name = "staging_ai_companies_visualization.highly_cited_ai_publications" + # And we write out our data to a different variable + companies = paper_finder.run_query_papers(table_name, "highly_cited_ai_pubs", by_year=True) + paper_finder.write_output(companies, args.output_file) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/company_linkage/schemas/highly_cited_papers_schema.json b/company_linkage/schemas/highly_cited_papers_schema.json new file mode 100644 index 00000000..4b065075 --- /dev/null +++ b/company_linkage/schemas/highly_cited_papers_schema.json @@ -0,0 +1,34 @@ +[ + { + "mode": "REQUIRED", + "name": "CSET_id", + "type": "INTEGER", + "description": "The CSET_id for a company." + }, + { + "mode": "REQUIRED", + "name": "highly_cited_ai_pubs", + "type": "INTEGER", + "description": "Count of highly cited AI papers." + }, + { + "fields": [ + { + "mode": "NULLABLE", + "name": "year", + "type": "INTEGER", + "description": "Publication year of papers." + }, + { + "mode": "NULLABLE", + "name": "highly_cited_ai_pubs", + "type": "INTEGER", + "description": "Count of highly cited AI papers published in that year." + } + ], + "mode": "REPEATED", + "name": "highly_cited_ai_pubs_by_year", + "type": "RECORD", + "description": "Highly cited AI papers by year." + } + ] \ No newline at end of file diff --git a/company_linkage/sequences/initial_data.csv b/company_linkage/sequences/initial_data.csv index 28cb66c9..02f3bd54 100644 --- a/company_linkage/sequences/initial_data.csv +++ b/company_linkage/sequences/initial_data.csv @@ -5,4 +5,5 @@ staging_ai_companies_visualization,linked_ai_patents_grants staging_ai_companies_visualization,linked_all_patents staging_ai_companies_visualization,top_conference_pubs staging_ai_companies_visualization,pubs_in_top_conferences +staging_ai_companies_visualization,highly_cited_ai_publications staging_ai_companies_visualization,all_publications \ No newline at end of file diff --git a/company_linkage/sequences/visualization_data.csv b/company_linkage/sequences/visualization_data.csv index d52aff91..63d8077f 100644 --- a/company_linkage/sequences/visualization_data.csv +++ b/company_linkage/sequences/visualization_data.csv @@ -1,6 +1,7 @@ staging_ai_companies_visualization,initial_visualization_data staging_ai_companies_visualization,visualization_data_with_by_year staging_ai_companies_visualization,visualization_data_with_top_papers +staging_ai_companies_visualization,visualization_data_with_highly_cited staging_ai_companies_visualization,visualization_data_with_all_papers staging_ai_companies_visualization,initial_patent_visualization_data staging_ai_companies_visualization,patent_visualization_data_with_by_year diff --git a/company_linkage/sql/highly_cited_ai_publications.sql b/company_linkage/sql/highly_cited_ai_publications.sql new file mode 100644 index 00000000..03346174 --- /dev/null +++ b/company_linkage/sql/highly_cited_ai_publications.sql @@ -0,0 +1,60 @@ +WITH + ai_pubs AS ( + SELECT + merged_id, + ror_id, + org_name, + country, + year + FROM + staging_ai_companies_visualization.ai_publications + ), + + citation_counts AS ( + SELECT + DISTINCT ref_id AS merged_id, + COUNT(DISTINCT + REFERENCES + .merged_id) AS citation_count, + ror_id, + org_name, + country, + year + FROM + literature.references + INNER JOIN + ai_pubs + ON + ref_id = ai_pubs.merged_id + GROUP BY + ref_id, + ror_id, + org_name, + country, + year + ), + + get_top_cited AS ( + SELECT + DISTINCT merged_id, + citation_count, + ror_id, + org_name, + country, + IF + (citation_count >= PERCENTILE_CONT(citation_count, 0.9) OVER(PARTITION BY year), TRUE, FALSE) AS top_cited, + year + FROM + citation_counts + ) + +SELECT + DISTINCT merged_id, + ror_id, + org_name, + country, + year +FROM + get_top_cited +WHERE + top_cited IS true \ No newline at end of file diff --git a/company_linkage/sql/visualization_data_with_all_papers.sql b/company_linkage/sql/visualization_data_with_all_papers.sql index 67901809..63c54121 100644 --- a/company_linkage/sql/visualization_data_with_all_papers.sql +++ b/company_linkage/sql/visualization_data_with_all_papers.sql @@ -13,7 +13,7 @@ WITH SELECT * FROM - staging_ai_companies_visualization.visualization_data_with_top_papers) + staging_ai_companies_visualization.visualization_data_with_highly_cited) -- Join the two together using the CSET id SELECT viz_data.*, diff --git a/company_linkage/sql/visualization_data_with_highly_cited.sql b/company_linkage/sql/visualization_data_with_highly_cited.sql new file mode 100644 index 00000000..d23ac8d0 --- /dev/null +++ b/company_linkage/sql/visualization_data_with_highly_cited.sql @@ -0,0 +1,27 @@ + -- Update the visualization table itself to add highly cited paper data + -- Pull in the highly cited paper counts, along with the CSET ids to link them in +WITH + count_data AS ( + SELECT + CSET_id, + highly_cited_ai_pubs, + highly_cited_ai_pubs_by_year, + FROM + staging_ai_companies_visualization.highly_cited_paper_counts), + -- Pull in the current visualization data. + viz_data AS ( + SELECT + * + FROM + staging_ai_companies_visualization.visualization_data_with_top_papers) + -- Join the two together using the CSET id +SELECT + viz_data.*, + highly_cited_ai_pubs, + highly_cited_ai_pubs_by_year, +FROM + viz_data +LEFT JOIN + count_data +ON + viz_data.CSET_id = count_data.CSET_id \ No newline at end of file