From f6ec8d952821336438fb55557c9c4499ab9111fa Mon Sep 17 00:00:00 2001
From: Sid
Date: Wed, 18 Dec 2024 10:22:45 +0100
Subject: [PATCH] init

---
 .gitignore        | 64 +++++++++++++++++++++++++++++++++++++++++++++++
 README.md         | 39 +++++++++++++++++++++++++++++
 runScraper.js     | 18 -------------
 scraper.js        | 23 ++++++++++-------
 server.js         | 42 +++++++++++++++----------------
 src/runScraper.ts | 18 ++++++++++++++
 src/scraper.ts    | 28 +++++++++++++++++++++
 src/server.ts     | 29 +++++++++++++++++++++
 tsconfig.json     | 14 ++++++++++++
 9 files changed, 226 insertions(+), 49 deletions(-)
 create mode 100644 .gitignore
 create mode 100644 README.md
 delete mode 100644 runScraper.js
 create mode 100644 src/runScraper.ts
 create mode 100644 src/scraper.ts
 create mode 100644 src/server.ts
 create mode 100644 tsconfig.json

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b1a5d5e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,64 @@
+# Node.js
+node_modules/
+npm-debug.log
+yarn-error.log
+
+# Logs
+logs
+*.log
+npm-debug.log*
+yarn-debug.log*
+yarn-error.log*
+
+# Runtime data
+pids
+*.pid
+*.seed
+*.pid.lock
+
+# Directory for instrumented libs generated by jscoverage/JSCover
+lib-cov
+
+# Coverage directory used by tools like istanbul
+coverage
+
+# nyc test coverage
+.nyc_output
+
+# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
+.grunt
+
+# Bower dependency directory (https://bower.io/)
+bower_components
+
+# Compiled binary addons (https://nodejs.org/api/addons.html)
+build/Release
+
+# Dependency directories
+jspm_packages/
+
+# TypeScript cache
+*.tsbuildinfo
+
+# Optional npm cache directory
+.npm
+
+# Optional eslint cache
+.eslintcache
+
+# Optional REPL history
+.node_repl_history
+
+# Output of 'npm pack'
+*.tgz
+
+# dotenv environment variables file
+.env
+
+# MacOS
+.DS_Store
+
+# IDEs and editors
+.idea/
+.vscode/
+*.sublime-workspace
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..7d69a9d
--- /dev/null
+++ b/README.md
@@ -0,0 +1,39 @@
+# Price Scraper Project
+
+This project uses Playwright to scrape the Dealabs site and fetch the product of the day. The data is then sent to a Node.js backend. The scraper can be run manually, or automatically every 5 minutes via a cron job.
+
+## Prerequisites
+
+- **Node.js**: version 20.x or later is recommended.
+- **npm**: ships with Node.js; make sure it is available.
+
+## Installation
+
+1. Clone the repository:
+
+   ```bash
+   git clone <repository-url>
+   cd scrap-and-back
+   ```
+
+2. Install the dependencies:
+
+   ```bash
+   npm install
+   ```
+
+3. Install the development dependencies:
+
+   ```bash
+   npm install --save-dev typescript @types/node
+   ```
+
+## Configuration
+
+1. Make sure `tsconfig.json` is configured correctly for your environment.
+
+2. Update the CSS selectors in `src/scraper.ts` if necessary so they target the right elements on Dealabs.
+
+## Compilation
+
+Compile the TypeScript project to JavaScript:
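+
+With the `tsconfig.json` in this repository (sources in `src/`, compiled output in `./dist`), this is typically:
+
+```bash
+npx tsc
+```
+
+The compiled files land in `dist/`, e.g. `dist/server.js` and `dist/runScraper.js`.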
diff --git a/runScraper.js b/runScraper.js
deleted file mode 100644
index cf05232..0000000
--- a/runScraper.js
+++ /dev/null
@@ -1,18 +0,0 @@
-const scrapePrices = require('./scraper');
-const axios = require('axios');
-
-async function runScraper() {
-  try {
-    const productOfTheDay = await scrapePrices();
-    if (productOfTheDay) {
-      await axios.post('http://localhost:3000/prices', productOfTheDay);
-      console.log('Product of the day sent:', productOfTheDay);
-    } else {
-      console.log('No product of the day found.');
-    }
-  } catch (error) {
-    console.error('Error running scraper:', error);
-  }
-}
-
-runScraper();
\ No newline at end of file
diff --git a/scraper.js b/scraper.js
index 6e8b343..9177d98 100644
--- a/scraper.js
+++ b/scraper.js
@@ -1,27 +1,32 @@
 const { chromium } = require('playwright')
 
 async function scrapePrices() {
+  console.log('Running scraper...')
+
   const browser = await chromium.launch({ headless: false })
   const page = await browser.newPage()
   await page.goto('https://www.dealabs.com')
 
   await page.getByRole('button', { name: 'Tout accepter' }).click()
   await page.getByRole('button', { name: 'Les + hot' }).click()
-  await page.pause()
+  await page.waitForTimeout(1000)
+
   const productOfTheDay = await page.evaluate(() => {
-    const productElement = document.querySelector('.top-deal');
+    const productElement = document.querySelector('article[id^="thread_"]')
+
+    console.log('productElement', productElement)
     if (productElement) {
-      return {
-        title: productElement.querySelector('.thread-title').textContent.trim(),
-        price: productElement.querySelector('.thread-price').textContent.trim(),
-        link: productElement.querySelector('a').href
-      };
+      return {
+        title: productElement.querySelector('.thread-title').textContent.trim(),
+        price: productElement.querySelector('.thread-price').textContent.trim(),
+        link: productElement.querySelector('a').href,
+      }
     }
-    return null;
+    return null
   })
 
   await browser.close()
+  console.log('productOfTheDay', productOfTheDay)
   return productOfTheDay
 }
 
-scrapePrices()
 module.exports = scrapePrices
diff --git a/server.js b/server.js
index 2fb0134..606690a 100644
--- a/server.js
+++ b/server.js
@@ -1,27 +1,25 @@
-const express = require('express');
-const bodyParser = require('body-parser');
-const cron = require('node-cron');
-const scrapePrices = require('./scraper');
-const axios = require('axios');
+const express = require('express')
+const bodyParser = require('body-parser')
+const cron = require('node-cron')
+const scrapePrices = require('./scraper')
+const axios = require('axios')
 
-const app = express();
-app.use(bodyParser.json());
+const app = express()
+app.use(bodyParser.json())
 
-app.post('/prices', (req, res) => {
-  const prices = req.body;
-  console.log('Received prices:', prices);
-  res.status(200).send('Prices received');
-});
+// app.post('/prices', (req, res) => {
+//   const prices = req.body
+//   console.log('Received prices:', prices)
+//   res.status(200).send('Prices received')
+// })
 
-cron.schedule('*/5 * * * *', async () => {
-  console.log('Running scraper...');
-  const prices = await scrapePrices();
-  await axios.post('http://localhost:3000/prices', prices)
-    .then(response => console.log(response.data))
-    .catch(error => console.error('Error sending prices:', error));
-});
+cron.schedule('* * * * *', async () => {
+  const prices = await scrapePrices()
+  console.log('prices', prices)
+})
 
-const PORT = 3000;
+const PORT = 3000
 app.listen(PORT, () => {
-  console.log(`Server is running on port ${PORT}`);
-});
\ No newline at end of file
+  scrapePrices()
+  console.log(`Server is running on port ${PORT}`)
+})
diff --git a/src/runScraper.ts b/src/runScraper.ts
new file mode 100644
index 0000000..2c8a26c
--- /dev/null
+++ b/src/runScraper.ts
@@ -0,0 +1,18 @@
+import scrapePrices from './scraper';
+import axios from 'axios';
+
+async function runScraper() {
+  try {
+    const productOfTheDay = await scrapePrices();
+    if (productOfTheDay) {
+      await axios.post('http://localhost:3000/prices', productOfTheDay);
+      console.log('Product of the day sent:', productOfTheDay);
+    } else {
+      console.log('No product of the day found.');
+    }
+  } catch (error) {
+    console.error('Error running scraper:', error);
+  }
+}
+
+runScraper();
\ No newline at end of file
diff --git a/src/scraper.ts b/src/scraper.ts
new file mode 100644
index 0000000..9482784
--- /dev/null
+++ b/src/scraper.ts
@@ -0,0 +1,28 @@
+import { chromium } from 'playwright';
+
+async function scrapePrices(): Promise<{ title: string; price: string; link: string } | null> {
+  const browser = await chromium.launch({ headless: false });
+  const page = await browser.newPage();
+  await page.goto('https://www.dealabs.com');
+
+  await page.getByRole('button', { name: 'Tout accepter' }).click();
+  await page.getByRole('button', { name: 'Les + hot' }).click();
+  await page.pause();
+  const productOfTheDay = await page.evaluate(() => {
+    const productElement = document.querySelector('.top-deal');
+    if (productElement) {
+      return {
+        title: (productElement.querySelector('.thread-title') as HTMLElement).textContent?.trim() || '',
+        price: (productElement.querySelector('.thread-price') as HTMLElement).textContent?.trim() || '',
+        link: (productElement.querySelector('a') as HTMLAnchorElement).href
+      };
+    }
+    return null;
+  });
+
+  await browser.close();
+  return productOfTheDay;
+}
+
+scrapePrices();
+export default scrapePrices;
\ No newline at end of file
diff --git a/src/server.ts b/src/server.ts
new file mode 100644
index 0000000..ec9d0f5
--- /dev/null
+++ b/src/server.ts
@@ -0,0 +1,29 @@
+import express from 'express';
+import bodyParser from 'body-parser';
+import cron from 'node-cron';
+import scrapePrices from './scraper';
+import axios from 'axios';
+
+const app = express();
+app.use(bodyParser.json());
+
+app.post('/prices', (req, res) => {
+  const prices = req.body;
+  console.log('Received prices:', prices);
+  res.status(200).send('Prices received');
+});
+
+cron.schedule('*/5 * * * *', async () => {
+  console.log('Running scraper...');
+  const prices = await scrapePrices();
+  if (prices) {
+    await axios.post('http://localhost:3000/prices', prices)
+      .then(response => console.log(response.data))
+      .catch(error => console.error('Error sending prices:', error));
+  }
+});
+
+const PORT = 3000;
+app.listen(PORT, () => {
+  console.log(`Server is running on port ${PORT}`);
+});
\ No newline at end of file
diff --git a/tsconfig.json b/tsconfig.json
new file mode 100644
index 0000000..0e1e958
--- /dev/null
+++ b/tsconfig.json
@@ -0,0 +1,14 @@
+{
+  "compilerOptions": {
+    "target": "ES6",
+    "module": "commonjs",
+    "strict": true,
+    "esModuleInterop": true,
+    "skipLibCheck": true,
+    "forceConsistentCasingInFileNames": true,
+    "outDir": "./dist",
+    "rootDir": "./src"
+  },
+  "include": ["src/**/*"],
+  "exclude": ["node_modules", "dist"]
+}
\ No newline at end of file
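
A note on the new `src/scraper.ts`: the `as HTMLElement` casts inside `page.evaluate` only silence the compiler. `querySelector` can still return `null` at runtime, so a missing `.thread-title` or `.thread-price` node throws a `TypeError` instead of falling through to `return null`, and the top-level `scrapePrices()` call means merely importing the module from `src/server.ts` launches an extra browser. A minimal null-safe sketch, not part of the patch: the name `scrapePricesSafe` is illustrative, and it assumes the `article[id^="thread_"]` container that the updated `scraper.js` targets.

```typescript
import { chromium } from 'playwright';

interface Deal {
  title: string;
  price: string;
  link: string;
}

async function scrapePricesSafe(): Promise<Deal | null> {
  const browser = await chromium.launch({ headless: false });
  try {
    const page = await browser.newPage();
    await page.goto('https://www.dealabs.com');

    // The cookie banner does not always appear; don't fail if the button is missing.
    await page
      .getByRole('button', { name: 'Tout accepter' })
      .click({ timeout: 5000 })
      .catch(() => {});

    return await page.evaluate<Deal | null>(() => {
      const productElement = document.querySelector('article[id^="thread_"]');
      const title = productElement?.querySelector('.thread-title')?.textContent?.trim();
      const price = productElement?.querySelector('.thread-price')?.textContent?.trim();
      const link = productElement?.querySelector('a')?.href;
      // Every lookup above may come back empty; only report a deal when
      // all three fields were found.
      return title && price && link ? { title, price, link } : null;
    });
  } finally {
    await browser.close();
  }
}

export default scrapePricesSafe;
```

Wrapping the browser in `try`/`finally` also guarantees it is closed when `goto` or a click times out, which neither `scraper.js` nor `src/scraper.ts` currently does.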