Fix/scraper elasticsearch insertion (#53)
urvishp80 authored Oct 25, 2023
1 parent a1c9e81 commit ef697e5
Showing 9 changed files with 580 additions and 212 deletions.
5 changes: 3 additions & 2 deletions .env.sample
@@ -1,4 +1,5 @@
DATA_DIR=/tmp/chaincode-scrapper
DATA_DIR=./tmp/chaincode-scrapper
DAYS_TO_SUBTRACT = 15

# For app search
ES_URL=
@@ -8,5 +9,5 @@ ES_ENGINE=
# For elasticsearch cluster
CLOUD_ID=
USERNAME=
PASSWORD=
USER_PASSWORD=
INDEX=
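
For context, a minimal sketch of how the renamed variables might be read by the scrapers, assuming dotenv-style loading; the fallback values and logging are illustrative and not code from this commit:

```js
// Illustrative only: reading the variables renamed/added in .env.sample.
// DAYS_TO_SUBTRACT and USER_PASSWORD are the names introduced here; the
// surrounding helper code is a hypothetical sketch, not part of the repository.
require("dotenv").config();

const dataDir = process.env.DATA_DIR || "./tmp/chaincode-scrapper";
const daysToSubtract = parseInt(process.env.DAYS_TO_SUBTRACT || "15", 10);
const apiKey = process.env.USER_PASSWORD; // renamed from PASSWORD in this commit

console.log(`Scraping into ${dataDir}, looking back ${daysToSubtract} days`);
```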
3 changes: 1 addition & 2 deletions .github/workflows/mailing-list-bitcoin.yml
@@ -27,8 +27,7 @@ jobs:
USER_PASSWORD: ${{ secrets.USER_PASSWORD }}
USERNAME: ${{ secrets.USERNAME }}
INDEX: ${{ secrets.INDEX }}
DAYS_TO_SUBTRACT: ${{ secrets.DAYS_TO_SUBTRACT }}
DATA_DIR: /tmp/data
URL: https://lists.linuxfoundation.org/pipermail/bitcoin-dev/
NAME: bitcoin
START_MONTH: 5
START_YEAR: 2011
3 changes: 1 addition & 2 deletions .github/workflows/mailing-list-lightning.yml
@@ -27,8 +27,7 @@ jobs:
USER_PASSWORD: ${{ secrets.USER_PASSWORD }}
USERNAME: ${{ secrets.USERNAME }}
INDEX: ${{ secrets.INDEX }}
DAYS_TO_SUBTRACT: ${{ secrets.DAYS_TO_SUBTRACT }}
DATA_DIR: /tmp/data
URL: https://lists.linuxfoundation.org/pipermail/lightning-dev/
NAME: lightning
START_MONTH: 5
START_YEAR: 2015
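
As a hedged illustration of what DAYS_TO_SUBTRACT is presumably for (judging by its name alone), a scraper could compute a cutoff date like this; the actual filtering logic lives elsewhere in the repository:

```js
// Hypothetical sketch: derive a cutoff date from DAYS_TO_SUBTRACT so only
// recent mailing-list posts are re-indexed. This is an assumption based on
// the variable name, not code taken from this commit.
const days = parseInt(process.env.DAYS_TO_SUBTRACT || "15", 10);

const cutoff = new Date();
cutoff.setDate(cutoff.getDate() - days);

function isRecent(postDate) {
    // postDate: a Date parsed from the mailing-list archive
    return postDate >= cutoff;
}

console.log(`Only indexing documents newer than ${cutoff.toISOString()}`);
```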
135 changes: 135 additions & 0 deletions .gitignore
@@ -1,8 +1,143 @@
.idea/

node_modules
__pycache__
.vscode
.vs
tmp/
yarn-error.log
.env
scrapybot/*.xml
scrapybot/*.bz2
scrapybot/*.json

# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# Snowpack dependency directory (https://snowpack.dev/)
web_modules/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional stylelint cache
.stylelintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local

# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache

# Next.js build output
.next
out

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# vuepress v2.x temp and cache directory
.temp
.cache

# Docusaurus cache and generated files
.docusaurus

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

# Stores VSCode versions used for testing VSCode extensions
.vscode-test

# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
2 changes: 1 addition & 1 deletion README.md
@@ -18,7 +18,7 @@ You should be calling the scrapers from the root dir because they use the common
This section explains how to run the scrapers in `scrapybot` folder

The folder has a bunch of crawlers(spiders) in the `scrapybot/scrapybot/spiders` folder. Each of the crawler files is specific to a particular site.
To run a crawler using scrapybot, for example `rusty` ,which will scrape the site `https://rusty.ozlabs.org`,switch to the root directory(where there is this README file) and run these commands from your terminal:
To run a crawler using scrapybot, for example `rusty`, which will scrape the site `https://rusty.ozlabs.org`, switch to the root directory (where this README file is) and run these commands from your terminal:
- `pip install -r requirements.txt && cd scrapybot`
- ` scrapy crawl rusty -O rusty.json`

92 changes: 71 additions & 21 deletions common/elasticsearch-scraper/util.js
@@ -1,11 +1,17 @@
const { Client } = require("@elastic/elasticsearch");
const {
Client
} = require("@elastic/elasticsearch");

function create_batches(objects, size) {
const batches = [];
for (let i = 0; i < objects.length; i += size) {
const batch = [];
for (let j = 0; j < size; j++) {
if (objects[i + j]) { // Timestamp the object upload to strictly order it
const timestampedObj = {...objects[i+j], indexed_at: new Date().toISOString()}
const timestampedObj = {
...objects[i + j],
indexed_at: new Date().toISOString()
}
batch.push(timestampedObj);
}
}
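
For reference, a small usage sketch of `create_batches` with illustrative documents; it splits the input into fixed-size batches and stamps each copy with `indexed_at` so uploads can be strictly ordered. The require path from the repository root is assumed:

```js
// Usage sketch for create_batches with illustrative documents.
const { create_batches } = require("./common/elasticsearch-scraper/util");

const docs = Array.from({ length: 120 }, (_, i) => ({ id: `doc-${i}`, body: "..." }));
const batches = create_batches(docs, 50);

console.log(batches.length);            // 3 batches: 50 + 50 + 20 documents
console.log(batches[0][0].indexed_at);  // ISO timestamp added during batching
```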
@@ -36,10 +42,14 @@ async function fetch_with_retry(url, options) {
async function index_documents(documents) {
let cloud_id = process.env.CLOUD_ID;
let username = process.env.USERNAME;
let api_key = process.env.USER_PASSWORD;
let api_key = process.env.USER_PASSWORD
const client = new Client({
cloud: { id: cloud_id },
auth: { apiKey: api_key }
cloud: {
id: cloud_id
},
auth: {
apiKey: api_key
}
});

const batches = create_batches(documents, 50);
@@ -52,8 +62,16 @@ async function index_documents(documents) {

while (!success) {
try {
const operations = batch.flatMap(doc => [{ index: { _index: process.env.INDEX } }, doc])
const bulkResponse = await client.bulk({ refresh: true, pipeline: "avoid-duplicates", operations })
const operations = batch.flatMap(doc => [{
index: {
_index: process.env.INDEX
}
}, doc])
const bulkResponse = await client.bulk({
refresh: true,
pipeline: "avoid-duplicates",
operations
})
console.log(bulkResponse);

success = true;
@@ -65,21 +83,21 @@ async function index_documents(documents) {
// The presence of the `error` key indicates that the operation
// that we did for the document has failed.
bulkResponse.items.forEach((action, i) => {
const operation = Object.keys(action)[0]
if (action[operation].error) {
erroredDocuments.push({
// If the status is 429 it means that you can retry the document,
// otherwise it's very likely a mapping error, and you should
// fix the document before to try it again.
status: action[operation].status,
error: action[operation].error,
operation: operations[i * 2],
document: operations[i * 2 + 1]
})
}
const operation = Object.keys(action)[0]
if (action[operation].error) {
erroredDocuments.push({
// If the status is 429 it means that you can retry the document,
// otherwise it's very likely a mapping error, and you should
// fix the document before to try it again.
status: action[operation].status,
error: action[operation].error,
operation: operations[i * 2],
document: operations[i * 2 + 1]
})
}
})
console.log(erroredDocuments);
}
}

} catch (e) {
console.log(e);
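
The bulk insert above routes documents through an ingest pipeline named `avoid-duplicates`, which is not defined in this commit. A hypothetical sketch of registering such a pipeline (assuming deduplication works by copying the scraped `id` into `_id`, which may differ from the real setup) could look like this:

```js
// Hypothetical one-off registration of the "avoid-duplicates" ingest pipeline
// referenced by the bulk call. Assumption: duplicates are avoided by using the
// scraped document's id as _id so re-runs overwrite instead of duplicating.
const { Client } = require("@elastic/elasticsearch");

const client = new Client({
    cloud: { id: process.env.CLOUD_ID },
    auth: { apiKey: process.env.USER_PASSWORD },
});

async function registerPipeline() {
    await client.ingest.putPipeline({
        id: "avoid-duplicates",
        description: "Copy the scraped document id into _id to drop duplicates",
        processors: [
            { set: { field: "_id", copy_from: "id" } },
        ],
    });
}
```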
@@ -100,8 +118,40 @@ async function index_documents(documents) {
console.log("Done");
}


async function checkDocumentExist(document_id) {
let cloud_id = process.env.CLOUD_ID;
let username = process.env.USERNAME;
let api_key = process.env.USER_PASSWORD
const client = new Client({
cloud: {
id: cloud_id
},
auth: {
apiKey: api_key
}
});

const bulkResponse = await client.count({
index: process.env.INDEX,
body: {
query: {
bool: {
must: [{
term: {
"id.keyword": document_id
}
}]
}
}
}
});
return bulkResponse.count > 0;
}

module.exports = {
create_batches,
index_documents,
fetch_with_retry
fetch_with_retry,
checkDocumentExist,
};
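
A possible caller combining the new `checkDocumentExist` export with `index_documents`; the document shape and the require path from the repository root are assumptions for illustration:

```js
// Illustrative caller: skip documents that are already indexed, then
// bulk-insert the rest. The scraped-document shape is an assumption.
const { checkDocumentExist, index_documents } = require("./common/elasticsearch-scraper/util");

async function run(scrapedDocs) {
    const fresh = [];
    for (const doc of scrapedDocs) {
        // checkDocumentExist matches on the id.keyword field in the index
        if (!(await checkDocumentExist(doc.id))) {
            fresh.push(doc);
        }
    }
    if (fresh.length > 0) {
        await index_documents(fresh);
    }
}
```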