Skip to content

Commit

Permalink
Merge pull request #26 from UTMediaCAT/#19-email-when-crawl-stops
Browse files Browse the repository at this point in the history
email when the crawl.js stops
  • Loading branch information
jacqueline-chan authored Mar 11, 2021
2 parents ae9c94d + 6968063 commit 1c4497e
Show file tree
Hide file tree
Showing 3 changed files with 86 additions and 8 deletions.
65 changes: 63 additions & 2 deletions newCrawler/crawl.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,21 @@
Use: "node crawl.js -l <url1> ..."
Output: link_title_list.json
*/
// Memory limit, in megabytes, exposed through the APIFY_MEMORY_MBYTES
// environment variable (presumably read by the Apify SDK required below —
// confirm). Only the last assignment ever took effect, so the overridden
// 30720 assignment is dropped; 30720 was the previous, larger setting.
// The value is written as a string because process.env stores strings only.
process.env.APIFY_MEMORY_MBYTES = '2048';

// let appmetrics = require('appmetrics');
const Apify = require('apify');
const path = require('path');
var { Readability } = require('@mozilla/readability');
var JSDOM = require('jsdom').JSDOM;

//email
let nodemailer = require('nodemailer');

let {mailOptions, mailError} = require('./email')

const { v5: uuidv5 } = require('uuid');
const parse = require('csv-parse/lib/sync')
const { performance } = require('perf_hooks');
Expand Down Expand Up @@ -126,6 +134,14 @@ function parseCSV(file){
// console.log('[' + new Date(cpu.time) + '] CPU: ' + cpu.process);
// });

// Mail transport used by sendMail() to deliver crawl notifications via Gmail.
// Credentials are taken from the environment when present so that real secrets
// never need to be written into (and committed with) this file. The original
// literals remain as fallbacks, so behaviour is unchanged when the variables
// are not set.
// NOTE(review): CRAWLER_EMAIL_USER / CRAWLER_EMAIL_PASS are variable names
// introduced by this change — confirm they fit the deployment's conventions.
let transporter = nodemailer.createTransport({
    service: 'gmail',
    auth: {
        user: process.env.CRAWLER_EMAIL_USER || '[email protected]',
        pass: process.env.CRAWLER_EMAIL_PASS || "DO NOT COMMIT THIS password"
    }
});

Apify.main(async () => {
// Get the urls from the command line arguments.
var is_url = false;
Expand Down Expand Up @@ -208,6 +224,7 @@ Apify.main(async () => {
}
pseudoUrls.push(new Apify.PseudoUrl(pseudoDomain));
}
console.log('making results directory...')

// Create a directory to hold all the individual JSON files.
fs.mkdir(path.join(__dirname, 'results'), (err) => {
Expand Down Expand Up @@ -381,9 +398,13 @@ Apify.main(async () => {
// Run the crawler.

try {
    console.log('running the crawler...\n')
    await crawler.run();
    // The crawl ended normally: send the regular "crawler has stopped" mail.
    await sendMail(mailOptions)

} catch(e){
    console.log(e)
    // Something above failed: send the error notification rather than
    // repeating the normal one, so the recipient can tell the two apart.
    await sendMail(mailError)
}

const t1 = performance.now();
Expand Down Expand Up @@ -416,7 +437,47 @@ Apify.main(async () => {
if (removeSelf)
fs.rmdirSync(dirPath);
};

console.log("removing apify storage")
rmDir('./apify_storage/', true);


});


/**
 * Send one e-mail through the module-level `transporter`, exposing the
 * callback-based transporter.sendMail() call as a Promise.
 *
 * @param {Object} mailOptions - Message description passed straight to transporter.sendMail().
 * @returns {Promise<Object>} Resolves with the `info` value reported by the
 *     transporter; rejects with the error it reported.
 */
function sendMail (mailOptions){
    const deliver = (resolve, reject) => {
        const onFinished = (err, info) => {
            if (!err) {
                console.log(`Mail sent successfully!`);
                resolve(info);
                return;
            }
            // Delivery failed: log the cause, then pass it on to the caller.
            console.log("error: ", err);
            console.log("email could not be sent");
            reject(err);
        };
        transporter.sendMail(mailOptions, onFinished);
    };
    return new Promise(deliver);
}

/**
 * Recursively delete everything inside a directory and, optionally, the
 * directory itself.
 *
 * @param {string} dirPath - Directory whose contents are removed.
 * @param {boolean} [removeSelf=true] - When truthy, dirPath itself is removed
 *     once it has been emptied.
 */
function rmDir (dirPath, removeSelf = true){
    let files;
    try {
        files = fs.readdirSync(dirPath);
    } catch (e) {
        // The directory could not be listed (e.g. it does not exist). Report
        // it and stop: there is nothing to iterate over, and continuing would
        // crash on `files.length` with `files` undefined.
        console.error(e);
        return;
    }
    for (const name of files) {
        const filePath = path.join(dirPath, name);
        if (fs.statSync(filePath).isFile())
            fs.unlinkSync(filePath);
        else
            rmDir(filePath); // sub-directory: empty it and remove it too
    }
    if (removeSelf)
        fs.rmdirSync(dirPath);
}
10 changes: 5 additions & 5 deletions newCrawler/crawlCheerio.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ var JSDOM = require('jsdom').JSDOM;
//email
let nodemailer = require('nodemailer');

let {mailOptions, mailError} = require('./email')
let {mailOptions, mailError, mailOptionsCheerio, mailErrorCheerio} = require('./email')

const { v5: uuidv5 } = require('uuid');
const parse = require('csv-parse/lib/sync')
Expand Down Expand Up @@ -487,11 +487,11 @@ Apify.main(async () => {
try {
    console.log("running the cheerio crawler...\n")
    await crawler.run();
    // Normal completion: send only the Cheerio-specific "stopped" mail,
    // not the generic one as well.
    await sendMail(mailOptionsCheerio);

} catch(e){
    console.error(e);
    // Failure: send only the Cheerio-specific error mail.
    await sendMail(mailErrorCheerio);
}

const t1 = performance.now();
Expand All @@ -510,9 +510,9 @@ Apify.main(async () => {
});


function sendMail (mailOptions){
function sendMail (mailOptionsCheerio){
return new Promise(function (resolve, reject){
transporter.sendMail(mailOptions, (err, info) => {
transporter.sendMail(mailOptionsCheerio, (err, info) => {
if (err) {
console.log("error: ", err);
console.log("email could not be sent");
Expand Down
19 changes: 18 additions & 1 deletion newCrawler/email.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,24 @@ let mailError = {
text: 'Email to let you know that the crawler has stopped crawling with an error'
};

// Message sent when the Cheerio crawler stops without an error.
// Sender and recipient are both `myemail`, which is not defined in this
// excerpt — presumably declared earlier in email.js.
let mailOptionsCheerio = {
from: myemail,
to: myemail,
subject: 'The Cheerio crawler has stopped',
text: 'Email to let you know that the crawler has stopped crawling'
};


// Message sent when the Cheerio crawler stops because of an error.
// Same sender and recipient as above; only the subject and text differ.
let mailErrorCheerio = {
from: myemail,
to: myemail,
subject: 'The Cheerio crawler has stopped with an error',
text: 'Email to let you know that the crawler has stopped crawling with an error'
};

module.exports = {
mailOptions,
mailError
mailError,
mailOptionsCheerio,
mailErrorCheerio
};

0 comments on commit 1c4497e

Please sign in to comment.