index.js
require("dotenv").config();
const { writeFileSync } = require("fs");
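// node-fetch v2 is assumed here, since v3 is ESM-only and cannot be require()d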
const fetch = require("node-fetch");
const { stripHtml } = require("string-strip-html");
const metascraper = require("metascraper");
const jsonexport = require("jsonexport");
// api key
const apiKey = process.env.DISQUS_API_KEY || "";
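// expects a .env file next to this script, e.g.:
//   DISQUS_API_KEY=<your public Disqus API key>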
// keywords to look for
const keywords = ["feedback", "request", "preprint", "comment", "public"];
// disqus apis
const postsApi = "https://disqus.com/api/3.0/forums/listPosts";
// disqus "forum" names for bio/medrxiv
const forums = ["biorxivstage", "medrxiv"];
async function getComments() {
  // collect all comments from bio/medrxiv
  let items = [];
  for (const forum of forums) {
    // set search params
    const params = new URLSearchParams();
    params.set("api_key", apiKey);
    params.set("forum", forum);
    params.set("related", "thread");
params.set("limit", 100);
// hard limit request pages
for (let page = 0; page < 10; page++) {
console.log(`Getting page ${page + 1} of comments from ${forum}`);
// get page of results
const url = postsApi + "?" + params.toString();
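      // the listPosts payload looks roughly like:
      //   { cursor: { hasNext, next }, response: [{ url, forum, message, createdAt, author, ... }] }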
      const { cursor, response } = await (await fetch(url)).json();
      // collect comments
      items = items.concat(response);
      // set next page
      if (cursor?.hasNext) params.set("cursor", cursor?.next);
      else break;
    }
  }
  console.log(`Found ${items.length} total comments`);
  // keep only comment properties we want
  items = items.map(
    ({
      url = "",
      forum = "",
      message = "",
      createdAt = null,
      author = {},
    }) => ({
      url,
      forum,
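      // score: how many of the keywords appear in the comment text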
      keywords: keywords.filter((keyword) =>
        message.toLowerCase().includes(keyword)
      ).length,
      message: stripHtml(message).result,
      date: createdAt,
      username: author.username,
      name: author.name,
    })
  );
  // sort by date, newest first
  items = items.sort((a, b) => new Date(b.date) - new Date(a.date));
  // keep only the 100 most recent
  items = items.slice(0, 100);
  // create rules for metascraper to extract metadata from html
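  // e.g. bioRxiv pages expose tags like <meta name="citation_doi" content="10.1101/...">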
  const toRule = (field) => [
    ({ htmlDom }) =>
      Array.from(
        htmlDom(`meta[name="${field}"]`).map(
          (index, node) => node.attribs["content"]
        )
      ).join(", "),
  ];
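  // repeated tags (e.g. one citation_author tag per author) get joined into one comma-separated string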
  const rules = {
    doi: toRule("citation_doi"),
    title: toRule("citation_title"),
    authors: toRule("citation_author"),
    date: toRule("citation_date"),
  };
  // build the scraper once from the rules above, rather than once per comment
  const scrape = metascraper([rules]);
  // get paper metadata from links (all fetches fire concurrently)
  items = await Promise.all(
    items.map(async ({ url, ...rest }, index) => {
      console.log(`Getting paper metadata of comment ${index + 1}`);
      // fetch html content of link
      const html = await (await fetch(url)).text();
      // extract out metadata from html
      const metadata = await scrape({ html, url });
      // split object into comment info and paper info
      return { comment: { url, ...rest }, paper: metadata };
    })
  );
  // output to csv file; nested keys become flattened headers like "comment url"
  const csv = await jsonexport(items, { headerPathString: " " });
  writeFileSync("output.csv", csv, "utf-8");
}
// run main script, surfacing any errors
getComments().catch(console.error);
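// to run (a sketch; assumes Node 14+ and CommonJS-compatible dependency versions):
//   npm install dotenv node-fetch@2 string-strip-html metascraper jsonexport
//   node index.js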