Fix/scraper elasticsearch insertion (#53)
urvishp80 authored Oct 25, 2023
1 parent a1c9e81 commit ef697e5
Showing 9 changed files with 580 additions and 212 deletions.
5 changes: 3 additions & 2 deletions .env.sample
@@ -1,4 +1,5 @@
DATA_DIR=/tmp/chaincode-scrapper
DATA_DIR=./tmp/chaincode-scrapper
DAYS_TO_SUBTRACT = 15

# For app search
ES_URL=
@@ -8,5 +9,5 @@ ES_ENGINE=
# For elasticsearch cluster
CLOUD_ID=
USERNAME=
PASSWORD=
USER_PASSWORD=
INDEX=
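
For context, a minimal sketch of how the renamed variables might be read by the scrapers, assuming dotenv-style loading; the fallback values and logging are illustrative and not code from this commit:

```js
// Illustrative only: reading the variables renamed/added in .env.sample.
// DAYS_TO_SUBTRACT and USER_PASSWORD are the names introduced here; the
// surrounding helper code is a hypothetical sketch, not part of the repository.
require("dotenv").config();

const dataDir = process.env.DATA_DIR || "./tmp/chaincode-scrapper";
const daysToSubtract = parseInt(process.env.DAYS_TO_SUBTRACT || "15", 10);
const apiKey = process.env.USER_PASSWORD; // renamed from PASSWORD in this commit

console.log(`Scraping into ${dataDir}, looking back ${daysToSubtract} days`);
```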
3 changes: 1 addition & 2 deletions .github/workflows/mailing-list-bitcoin.yml
@@ -27,8 +27,7 @@ jobs:
USER_PASSWORD: ${{ secrets.USER_PASSWORD }}
USERNAME: ${{ secrets.USERNAME }}
INDEX: ${{ secrets.INDEX }}
DAYS_TO_SUBTRACT: ${{ secrets.DAYS_TO_SUBTRACT }}
DATA_DIR: /tmp/data
URL: https://lists.linuxfoundation.org/pipermail/bitcoin-dev/
NAME: bitcoin
START_MONTH: 5
START_YEAR: 2011
3 changes: 1 addition & 2 deletions .github/workflows/mailing-list-lightning.yml
@@ -27,8 +27,7 @@ jobs:
USER_PASSWORD: ${{ secrets.USER_PASSWORD }}
USERNAME: ${{ secrets.USERNAME }}
INDEX: ${{ secrets.INDEX }}
DAYS_TO_SUBTRACT: ${{ secrets.DAYS_TO_SUBTRACT }}
DATA_DIR: /tmp/data
URL: https://lists.linuxfoundation.org/pipermail/lightning-dev/
NAME: lightning
START_MONTH: 5
START_YEAR: 2015
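
As a hedged illustration of what DAYS_TO_SUBTRACT is presumably for (judging by its name alone), a scraper could compute a cutoff date like this; the actual filtering logic lives elsewhere in the repository:

```js
// Hypothetical sketch: derive a cutoff date from DAYS_TO_SUBTRACT so only
// recent mailing-list posts are re-indexed. This is an assumption based on
// the variable name, not code taken from this commit.
const days = parseInt(process.env.DAYS_TO_SUBTRACT || "15", 10);

const cutoff = new Date();
cutoff.setDate(cutoff.getDate() - days);

function isRecent(postDate) {
    // postDate: a Date parsed from the mailing-list archive
    return postDate >= cutoff;
}

console.log(`Only indexing documents newer than ${cutoff.toISOString()}`);
```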
135 changes: 135 additions & 0 deletions .gitignore
@@ -1,8 +1,143 @@
.idea/

node_modules
__pycache__
.vscode
.vs
tmp/
yarn-error.log
.env
scrapybot/*.xml
scrapybot/*.bz2
scrapybot/*.json

# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*
.pnpm-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# Snowpack dependency directory (https://snowpack.dev/)
web_modules/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional stylelint cache
.stylelintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variable files
.env
.env.development.local
.env.test.local
.env.production.local
.env.local

# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache

# Next.js build output
.next
out

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# vuepress v2.x temp and cache directory
.temp
.cache

# Docusaurus cache and generated files
.docusaurus

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

# Stores VSCode versions used for testing VSCode extensions
.vscode-test

# yarn v2
.yarn/cache
.yarn/unplugged
.yarn/build-state.yml
.yarn/install-state.gz
.pnp.*
2 changes: 1 addition & 1 deletion README.md
@@ -18,7 +18,7 @@ You should be calling the scrapers from the root dir because they use the common
This section explains how to run the scrapers in `scrapybot` folder

The folder has a bunch of crawlers(spiders) in the `scrapybot/scrapybot/spiders` folder. Each of the crawler files is specific to a particular site.
To run a crawler using scrapybot, for example `rusty` ,which will scrape the site `https://rusty.ozlabs.org`,switch to the root directory(where there is this README file) and run these commands from your terminal:
To run a crawler using scrapybot, for example `rusty`, which will scrape the site `https://rusty.ozlabs.org`, switch to the root directory (where this README file is) and run these commands from your terminal:
- `pip install -r requirements.txt && cd scrapybot`
- ` scrapy crawl rusty -O rusty.json`

92 changes: 71 additions & 21 deletions common/elasticsearch-scraper/util.js
@@ -1,11 +1,17 @@
const { Client } = require("@elastic/elasticsearch");
const {
Client
} = require("@elastic/elasticsearch");

function create_batches(objects, size) {
const batches = [];
for (let i = 0; i < objects.length; i += size) {
const batch = [];
for (let j = 0; j < size; j++) {
if (objects[i + j]) { // Timestamp the object upload to strictly order it
const timestampedObj = {...objects[i+j], indexed_at: new Date().toISOString()}
const timestampedObj = {
...objects[i + j],
indexed_at: new Date().toISOString()
}
batch.push(timestampedObj);
}
}
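
For reference, a small usage sketch of `create_batches` with illustrative documents; it splits the input into fixed-size batches and stamps each copy with `indexed_at` so uploads can be strictly ordered. The require path from the repository root is assumed:

```js
// Usage sketch for create_batches with illustrative documents.
const { create_batches } = require("./common/elasticsearch-scraper/util");

const docs = Array.from({ length: 120 }, (_, i) => ({ id: `doc-${i}`, body: "..." }));
const batches = create_batches(docs, 50);

console.log(batches.length);            // 3 batches: 50 + 50 + 20 documents
console.log(batches[0][0].indexed_at);  // ISO timestamp added during batching
```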
@@ -36,10 +42,14 @@ async function fetch_with_retry(url, options) {
async function index_documents(documents) {
let cloud_id = process.env.CLOUD_ID;
let username = process.env.USERNAME;
let api_key = process.env.USER_PASSWORD;
let api_key = process.env.USER_PASSWORD
const client = new Client({
cloud: { id: cloud_id },
auth: { apiKey: api_key }
cloud: {
id: cloud_id
},
auth: {
apiKey: api_key
}
});

const batches = create_batches(documents, 50);
@@ -52,8 +62,16 @@ async function index_documents(documents) {

while (!success) {
try {
const operations = batch.flatMap(doc => [{ index: { _index: process.env.INDEX } }, doc])
const bulkResponse = await client.bulk({ refresh: true, pipeline: "avoid-duplicates", operations })
const operations = batch.flatMap(doc => [{
index: {
_index: process.env.INDEX
}
}, doc])
const bulkResponse = await client.bulk({
refresh: true,
pipeline: "avoid-duplicates",
operations
})
console.log(bulkResponse);

success = true;
@@ -65,21 +83,21 @@ async function index_documents(documents) {
// The presence of the `error` key indicates that the operation
// that we did for the document has failed.
bulkResponse.items.forEach((action, i) => {
const operation = Object.keys(action)[0]
if (action[operation].error) {
erroredDocuments.push({
// If the status is 429 it means that you can retry the document,
// otherwise it's very likely a mapping error, and you should
// fix the document before to try it again.
status: action[operation].status,
error: action[operation].error,
operation: operations[i * 2],
document: operations[i * 2 + 1]
})
}
const operation = Object.keys(action)[0]
if (action[operation].error) {
erroredDocuments.push({
// If the status is 429 it means that you can retry the document,
// otherwise it's very likely a mapping error, and you should
// fix the document before to try it again.
status: action[operation].status,
error: action[operation].error,
operation: operations[i * 2],
document: operations[i * 2 + 1]
})
}
})
console.log(erroredDocuments);
}
}

} catch (e) {
console.log(e);
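
The bulk insert above routes documents through an ingest pipeline named `avoid-duplicates`, which is not defined in this commit. A hypothetical sketch of registering such a pipeline (assuming deduplication works by copying the scraped `id` into `_id`, which may differ from the real setup) could look like this:

```js
// Hypothetical one-off registration of the "avoid-duplicates" ingest pipeline
// referenced by the bulk call. Assumption: duplicates are avoided by using the
// scraped document's id as _id so re-runs overwrite instead of duplicating.
const { Client } = require("@elastic/elasticsearch");

const client = new Client({
    cloud: { id: process.env.CLOUD_ID },
    auth: { apiKey: process.env.USER_PASSWORD },
});

async function registerPipeline() {
    await client.ingest.putPipeline({
        id: "avoid-duplicates",
        description: "Copy the scraped document id into _id to drop duplicates",
        processors: [
            { set: { field: "_id", copy_from: "id" } },
        ],
    });
}
```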
@@ -100,8 +118,40 @@ async function index_documents(documents) {
console.log("Done");
}


async function checkDocumentExist(document_id) {
let cloud_id = process.env.CLOUD_ID;
let username = process.env.USERNAME;
let api_key = process.env.USER_PASSWORD
const client = new Client({
cloud: {
id: cloud_id
},
auth: {
apiKey: api_key
}
});

const bulkResponse = await client.count({
index: process.env.INDEX,
body: {
query: {
bool: {
must: [{
term: {
"id.keyword": document_id
}
}]
}
}
}
});
return bulkResponse.count > 0;
}

module.exports = {
create_batches,
index_documents,
fetch_with_retry
fetch_with_retry,
checkDocumentExist,
};
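
A possible caller combining the new `checkDocumentExist` export with `index_documents`; the document shape and the require path from the repository root are assumptions for illustration:

```js
// Illustrative caller: skip documents that are already indexed, then
// bulk-insert the rest. The scraped-document shape is an assumption.
const { checkDocumentExist, index_documents } = require("./common/elasticsearch-scraper/util");

async function run(scrapedDocs) {
    const fresh = [];
    for (const doc of scrapedDocs) {
        // checkDocumentExist matches on the id.keyword field in the index
        if (!(await checkDocumentExist(doc.id))) {
            fresh.push(doc);
        }
    }
    if (fresh.length > 0) {
        await index_documents(fresh);
    }
}
```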