Skip to content

Commit

Permalink
Add highly cited AI paper counts
Browse files Browse the repository at this point in the history
  • Loading branch information
rggelles committed Jan 30, 2024
1 parent 11d1e24 commit cef8444
Show file tree
Hide file tree
Showing 8 changed files with 165 additions and 2 deletions.
14 changes: 13 additions & 1 deletion company_linkage/parat_data_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@
)

run_papers = []
for paper_type in ["top_paper", "all_paper", "all_patent"]:
for paper_type in ["top_paper", "highly_cited_paper", "all_paper", "all_patent"]:

run_get_paper_counts = GKEStartPodOperator(
task_id=f"run_get_{paper_type}_counts",
Expand Down Expand Up @@ -262,6 +262,17 @@
write_disposition="WRITE_TRUNCATE"
)

load_highly_cited_papers = GCSToBigQueryOperator(
task_id=f"load_highly_cited_papers",
bucket=DATA_BUCKET,
source_objects=[f"{tmp_dir}/highly_cited_paper/highly_cited_paper_counts.jsonl"],
schema_object=f"{schema_dir}/highly_cited_papers_schema.json",
destination_project_dataset_table=f"{staging_dataset}.highly_cited_paper_counts",
source_format="NEWLINE_DELIMITED_JSON",
create_disposition="CREATE_IF_NEEDED",
write_disposition="WRITE_TRUNCATE"
)

load_all_papers = GCSToBigQueryOperator(
task_id=f"load_all_papers",
bucket=DATA_BUCKET,
Expand Down Expand Up @@ -376,6 +387,7 @@
>> load_ai_patent_grants
>> run_papers
>> load_top_papers
>> load_highly_cited_papers
>> load_all_papers
>> load_all_patents
>> start_visualization_tables
Expand Down
28 changes: 28 additions & 0 deletions company_linkage/parat_scripts/highly_cited_papers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import argparse

from get_ai_counts import CountGetter


def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument("output_file", type=str,
help="A jsonl file for writing output data to create new tables")
args = parser.parse_args()
if not args.output_file:
parser.print_help()
return
if "jsonl" not in args.output_file:
parser.print_help()
return
paper_finder = CountGetter()
paper_finder.get_identifiers()
# These are the only two lines that make this different from running AI pubs
# We select from a different table
table_name = "staging_ai_companies_visualization.highly_cited_ai_publications"
# And we write out our data to a different variable
companies = paper_finder.run_query_papers(table_name, "highly_cited_ai_pubs", by_year=True)
paper_finder.write_output(companies, args.output_file)


if __name__ == "__main__":
main()
34 changes: 34 additions & 0 deletions company_linkage/schemas/highly_cited_papers_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
[
{
"mode": "REQUIRED",
"name": "CSET_id",
"type": "INTEGER",
"description": "The CSET_id for a company."
},
{
"mode": "REQUIRED",
"name": "highly_cited_ai_pubs",
"type": "INTEGER",
"description": "Count of highly cited AI papers."
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "year",
"type": "INTEGER",
"description": "Publication year of papers."
},
{
"mode": "NULLABLE",
"name": "highly_cited_ai_pubs",
"type": "INTEGER",
"description": "Count of highly cited AI papers published in that year."
}
],
"mode": "REPEATED",
"name": "highly_cited_ai_pubs_by_year",
"type": "RECORD",
"description": "Highly cited AI papers by year."
}
]
1 change: 1 addition & 0 deletions company_linkage/sequences/initial_data.csv
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ staging_ai_companies_visualization,linked_ai_patents_grants
staging_ai_companies_visualization,linked_all_patents
staging_ai_companies_visualization,top_conference_pubs
staging_ai_companies_visualization,pubs_in_top_conferences
staging_ai_companies_visualization,highly_cited_ai_publications
staging_ai_companies_visualization,all_publications
1 change: 1 addition & 0 deletions company_linkage/sequences/visualization_data.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
staging_ai_companies_visualization,initial_visualization_data
staging_ai_companies_visualization,visualization_data_with_by_year
staging_ai_companies_visualization,visualization_data_with_top_papers
staging_ai_companies_visualization,visualization_data_with_highly_cited
staging_ai_companies_visualization,visualization_data_with_all_papers
staging_ai_companies_visualization,initial_patent_visualization_data
staging_ai_companies_visualization,patent_visualization_data_with_by_year
Expand Down
60 changes: 60 additions & 0 deletions company_linkage/sql/highly_cited_ai_publications.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
WITH
ai_pubs AS (
SELECT
merged_id,
ror_id,
org_name,
country,
year
FROM
staging_ai_companies_visualization.ai_publications
),

citation_counts AS (
SELECT
DISTINCT ref_id AS merged_id,
COUNT(DISTINCT
REFERENCES
.merged_id) AS citation_count,
ror_id,
org_name,
country,
year
FROM
literature.references
INNER JOIN
ai_pubs
ON
ref_id = ai_pubs.merged_id
GROUP BY
ref_id,
ror_id,
org_name,
country,
year
),

get_top_cited AS (
SELECT
DISTINCT merged_id,
citation_count,
ror_id,
org_name,
country,
IF
(citation_count >= PERCENTILE_CONT(citation_count, 0.9) OVER(PARTITION BY year), TRUE, FALSE) AS top_cited,
year
FROM
citation_counts
)

SELECT
DISTINCT merged_id,
ror_id,
org_name,
country,
year
FROM
get_top_cited
WHERE
top_cited IS true
2 changes: 1 addition & 1 deletion company_linkage/sql/visualization_data_with_all_papers.sql
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ WITH
SELECT
*
FROM
staging_ai_companies_visualization.visualization_data_with_top_papers)
staging_ai_companies_visualization.visualization_data_with_highly_cited)
-- Join the two together using the CSET id
SELECT
viz_data.*,
Expand Down
27 changes: 27 additions & 0 deletions company_linkage/sql/visualization_data_with_highly_cited.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
-- Update the visualization table itself to add highly cited paper data
-- Pull in the highly cited paper counts, along with the CSET ids to link them in
WITH
count_data AS (
SELECT
CSET_id,
highly_cited_ai_pubs,
highly_cited_ai_pubs_by_year,
FROM
staging_ai_companies_visualization.highly_cited_paper_counts),
-- Pull in the current visualization data.
viz_data AS (
SELECT
*
FROM
staging_ai_companies_visualization.visualization_data_with_top_papers)
-- Join the two together using the CSET id
SELECT
viz_data.*,
highly_cited_ai_pubs,
highly_cited_ai_pubs_by_year,
FROM
viz_data
LEFT JOIN
count_data
ON
viz_data.CSET_id = count_data.CSET_id

0 comments on commit cef8444

Please sign in to comment.