Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add highly cited AI paper counts #205

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion company_linkage/parat_data_dag.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@
)

run_papers = []
for paper_type in ["top_paper", "all_paper", "all_patent"]:
for paper_type in ["top_paper", "highly_cited_paper", "all_paper", "all_patent"]:

run_get_paper_counts = GKEStartPodOperator(
task_id=f"run_get_{paper_type}_counts",
Expand Down Expand Up @@ -262,6 +262,17 @@
write_disposition="WRITE_TRUNCATE"
)

# Load the highly cited paper counts from a newline-delimited JSON object in
# DATA_BUCKET into the staging dataset's highly_cited_paper_counts table,
# using highly_cited_papers_schema.json as the table schema. The table is
# created if it does not exist and fully replaced (WRITE_TRUNCATE) otherwise.
# Presumably the source object is produced by the
# run_get_highly_cited_paper_counts task -- confirm against that task's arguments.
load_highly_cited_papers = GCSToBigQueryOperator(
task_id=f"load_highly_cited_papers",
bucket=DATA_BUCKET,
source_objects=[f"{tmp_dir}/highly_cited_paper/highly_cited_paper_counts.jsonl"],
schema_object=f"{schema_dir}/highly_cited_papers_schema.json",
destination_project_dataset_table=f"{staging_dataset}.highly_cited_paper_counts",
source_format="NEWLINE_DELIMITED_JSON",
create_disposition="CREATE_IF_NEEDED",
write_disposition="WRITE_TRUNCATE"
)

load_all_papers = GCSToBigQueryOperator(
task_id=f"load_all_papers",
bucket=DATA_BUCKET,
Expand Down Expand Up @@ -376,6 +387,7 @@
>> load_ai_patent_grants
>> run_papers
>> load_top_papers
>> load_highly_cited_papers
>> load_all_papers
>> load_all_patents
>> start_visualization_tables
Expand Down
28 changes: 28 additions & 0 deletions company_linkage/parat_scripts/highly_cited_papers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import argparse

from get_ai_counts import CountGetter


def main() -> None:
    """Write highly cited AI paper counts to a JSONL file.

    Reads one positional command-line argument, ``output_file``. When that
    path is empty or does not end in ``.jsonl``, the usage text is printed and
    the function returns without running any query. Otherwise a
    ``CountGetter`` is used to query the highly cited AI publications table
    and the result is written to ``output_file``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("output_file", type=str,
                        help="A jsonl file for writing output data to create new tables")
    args = parser.parse_args()
    # Require a real ".jsonl" suffix. A substring test would also accept paths
    # such as "jsonl_dir/out.csv"; the suffix test rejects the empty string too.
    if not args.output_file.endswith(".jsonl"):
        parser.print_help()
        return
    paper_finder = CountGetter()
    paper_finder.get_identifiers()
    # Only the source table and the output field name distinguish this script
    # from the regular AI publications count run.
    table_name = "staging_ai_companies_visualization.highly_cited_ai_publications"
    companies = paper_finder.run_query_papers(table_name, "highly_cited_ai_pubs", by_year=True)
    paper_finder.write_output(companies, args.output_file)


if __name__ == "__main__":
main()
34 changes: 34 additions & 0 deletions company_linkage/schemas/highly_cited_papers_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
[
{
"mode": "REQUIRED",
"name": "CSET_id",
"type": "INTEGER",
"description": "The CSET_id for a company."
},
{
"mode": "REQUIRED",
"name": "highly_cited_ai_pubs",
"type": "INTEGER",
"description": "Count of highly cited AI papers."
},
{
"fields": [
{
"mode": "NULLABLE",
"name": "year",
"type": "INTEGER",
"description": "Publication year of papers."
},
{
"mode": "NULLABLE",
"name": "highly_cited_ai_pubs",
"type": "INTEGER",
"description": "Count of highly cited AI papers published in that year."
}
],
"mode": "REPEATED",
"name": "highly_cited_ai_pubs_by_year",
"type": "RECORD",
"description": "Highly cited AI papers by year."
}
]
1 change: 1 addition & 0 deletions company_linkage/sequences/initial_data.csv
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@ staging_ai_companies_visualization,linked_ai_patents_grants
staging_ai_companies_visualization,linked_all_patents
staging_ai_companies_visualization,top_conference_pubs
staging_ai_companies_visualization,pubs_in_top_conferences
staging_ai_companies_visualization,highly_cited_ai_publications
staging_ai_companies_visualization,all_publications
1 change: 1 addition & 0 deletions company_linkage/sequences/visualization_data.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
staging_ai_companies_visualization,initial_visualization_data
staging_ai_companies_visualization,visualization_data_with_by_year
staging_ai_companies_visualization,visualization_data_with_top_papers
staging_ai_companies_visualization,visualization_data_with_highly_cited
staging_ai_companies_visualization,visualization_data_with_all_papers
staging_ai_companies_visualization,initial_patent_visualization_data
staging_ai_companies_visualization,patent_visualization_data_with_by_year
Expand Down
60 changes: 60 additions & 0 deletions company_linkage/sql/highly_cited_ai_publications.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
-- Select the AI publication rows whose citation count is at or above the
-- 90th percentile of citation counts among rows sharing the same year.
WITH
-- AI publication rows, with the organization fields that are carried through
-- to the final output.
ai_pubs AS (
SELECT
merged_id,
ror_id,
org_name,
country,
year
FROM
staging_ai_companies_visualization.ai_publications
),

-- For each AI publication/organization row, count the distinct merged_id
-- values in literature.references whose ref_id is that publication
-- (presumably the citing papers -- confirm against the references schema).
-- Because this is an INNER JOIN, publications that never appear as a ref_id
-- produce no row here and so are excluded from the percentile below.
-- NOTE(review): the grouping yields one row per
-- (ref_id, ror_id, org_name, country, year); if ai_publications holds several
-- rows for one merged_id, that paper contributes several rows to the
-- percentile distribution -- confirm this weighting is intended.
citation_counts AS (
SELECT
DISTINCT ref_id AS merged_id,
COUNT(DISTINCT
REFERENCES
.merged_id) AS citation_count,
ror_id,
org_name,
country,
year
FROM
literature.references
INNER JOIN
ai_pubs
ON
ref_id = ai_pubs.merged_id
GROUP BY
ref_id,
ror_id,
org_name,
country,
year
),

-- Flag each row as top cited when its citation_count reaches the interpolated
-- 0.9 percentile of citation_count computed over all rows with the same year.
get_top_cited AS (
SELECT
DISTINCT merged_id,
citation_count,
ror_id,
org_name,
country,
IF
(citation_count >= PERCENTILE_CONT(citation_count, 0.9) OVER(PARTITION BY year), TRUE, FALSE) AS top_cited,
year
FROM
citation_counts
)

-- Keep only the flagged rows; citation_count and the flag are not output.
SELECT
DISTINCT merged_id,
ror_id,
org_name,
country,
year
FROM
get_top_cited
WHERE
top_cited IS true
2 changes: 1 addition & 1 deletion company_linkage/sql/visualization_data_with_all_papers.sql
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ WITH
SELECT
*
FROM
staging_ai_companies_visualization.visualization_data_with_top_papers)
staging_ai_companies_visualization.visualization_data_with_highly_cited)
-- Join the two together using the CSET id
SELECT
viz_data.*,
Expand Down
27 changes: 27 additions & 0 deletions company_linkage/sql/visualization_data_with_highly_cited.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
-- Extend the visualization table with highly cited AI paper counts.
-- Every row of the table that already carries top-paper data is kept; the
-- total and by-year highly cited counts are attached by matching on CSET_id.
WITH
  -- The visualization data built so far.
  viz_data AS (
  SELECT
    *
  FROM
    staging_ai_companies_visualization.visualization_data_with_top_papers),
  -- Highly cited paper counts keyed by CSET_id.
  count_data AS (
  SELECT
    CSET_id,
    highly_cited_ai_pubs,
    highly_cited_ai_pubs_by_year
  FROM
    staging_ai_companies_visualization.highly_cited_paper_counts)
-- Left join so rows without a matching count row are retained.
SELECT
  viz_data.*,
  count_data.highly_cited_ai_pubs,
  count_data.highly_cited_ai_pubs_by_year
FROM
  viz_data
LEFT JOIN
  count_data
ON
  viz_data.CSET_id = count_data.CSET_id
Loading