Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: update scripts to reflect new fixture structure #736

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions dist/generate-custom-parser.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion dist/generate-custom-parser.js.map

Large diffs are not rendered by default.

12 changes: 6 additions & 6 deletions dist/mercury.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion dist/mercury.js.map

Large diffs are not rendered by default.

5 changes: 0 additions & 5 deletions scripts/find-and-replace.sh

This file was deleted.

4 changes: 1 addition & 3 deletions scripts/generate-custom-parser.js
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,6 @@ function scaffoldCustomParser(url) {
if (!fs.existsSync(dir)) {
newParser = true;
confirmCreateDir(dir, `Creating ${hostname} directory`);
confirmCreateDir(`./fixtures/${hostname}`, 'Creating fixtures directory');
}

confirm(Parser.fetchResource, [url], 'Fetching fixture', newParser);
Expand Down Expand Up @@ -99,8 +98,7 @@ function savePage($, [url], newParser) {

spinner.succeed();

const filename = new Date().getTime();
const file = `./fixtures/${hostname}/${filename}.html`;
const file = `./fixtures/${hostname}.html`;
// fix http(s) relative links:
makeLinksAbsolute($('*').first(), $, url);
$('[src], [href]').each((index, node) => {
Expand Down
235 changes: 66 additions & 169 deletions scripts/update-fixtures.js
Original file line number Diff line number Diff line change
@@ -1,184 +1,81 @@
/* eslint-disable */

const { execFile, execFileSync } = require('child_process');
const fs = require('fs');
const fsPromises = require('fs/promises');
const path = require('path');
const URL = require('url');
const octokit = require('@octokit/rest')();

const Parser = require('../dist/mercury');

// get all fixtures
execFile('find', ['fixtures', '-type', 'f'], (err, stdout) => {
const fixtures = stdout.split('\n');

const now = new Date();
const twoWeeks = 2 * 7 * 24 * 60 * 60 * 1000;

// iterate through fixtures for fixtures older than 2 weeks
console.log('Finding fixtures to update...');
const fixturesToUpdate = fixtures
.filter(fixture => {
const timestamp = path
.basename(fixture)
.split(/\.html$/)[0]
.trim();
try {
const date = new Date(parseInt(timestamp, 10));
return now - date > twoWeeks;
} catch (e) {
// if fixture isn't a timestamp, ignore it
return false;
}
const FIXTURES_PATH = path.join(__dirname, '..', 'fixtures');

const perform = async () => {
const fixtures = (await fsPromises.readdir(FIXTURES_PATH)).filter(f =>
f.match(/\.html$/)
);

const TODAY = new Date();
const TWO_WEEKS_AGO = new Date(TODAY.setDate(TODAY.getDate() - 14));

console.log('Finding fixtures to update…');
const fixturesToUpdate = (await Promise.all(
fixtures.map(async filename => {
const stats = await fsPromises.stat(path.join(FIXTURES_PATH, filename));
return [filename, stats.mtime];
})
.slice(0, 1);
))
.filter(([_filename, timestamp]) => timestamp <= TWO_WEEKS_AGO)
.map(([filename, _timestamp]) => filename);
console.log(`${fixturesToUpdate.length} fixtures are out of date`);

// iterate through fixtures and extract their URLs.
console.log('Extracting urls...');
const baseDomains = fixturesToUpdate.map(fixture => fixture.split('/')[1]);
Promise.all(
fixturesToUpdate.map((fixture, i) => {
const html = fs.readFileSync(fixture);
return Parser.parse(`http://${baseDomains[i]}`, { html });
})
).then(parsedFixture => {
const fixturesAndUrls = fixturesToUpdate.reduce(
(acc, fixture, i) =>
acc.concat({
fixture,
url: parsedFixture[i].url,
baseDomain: baseDomains[i],
}),
[]
);

console.log('Updating all fixtures');
const fns = fixturesAndUrls
.map(fixtureAndUrl => {
return () => {
// console.log('Updating fixture for', fixtureAndUrl);
return updateFixture(fixtureAndUrl);
};
})
.concat(() => {
return new Promise(res => {
console.log('changed bases', changeBase);
console.log(`otherMess`, otherMess);
res();
});
});
promiseSerial(fns);
});
});

const changeBase = [];
const otherMess = [];
const updateFixture = ({ fixture, url, baseDomain }) => {
return new Promise(res => {
Parser.parse(url)
.then(({ url: updatedUrl }) => {
if (!updatedUrl) {
otherMess.push({ updatedUrl, url, fixture, baseDomain });
return res();
}
console.log(`updatedUrl`, updatedUrl);
const { hostname } = URL.parse(updatedUrl);
if (hostname !== baseDomain) {
console.log('Base URL has changed!!! Do something different');
console.log(`url`, url);
console.log(`updatedUrl`, updatedUrl);
console.log(`hostname`, hostname);
changeBase.push({
fixture,
url,
baseDomain,
newBaseDomain: hostname,
updatedUrl,
});
return res();
}
execFile('yarn', ['generate-parser', url], (err, stdout) => {
// console.log(`stdout`, stdout);
const dirRe = new RegExp(`(${path.dirname(fixture)}\/\\d+\.html)`);
const newFixture = stdout.match(dirRe)[0];

console.log(`newFixture`, newFixture);
// replace old fixture with new fixture in tests
execFile(
'./scripts/find-and-replace.sh',
[fixture, newFixture, 'src/extractors/custom/**/*.test.js'],
(err, stdout) => {
// remove old fixture
fs.unlinkSync(fixture);
const { branchName, commitMessage } = doTestsPass(baseDomain)
? {
branchName: `chore-update-${baseDomain}-fixture`,
commitMessage: `chore: update ${baseDomain} fixture`,
}
: {
branchName: `fix-update-${baseDomain}-extractor`,
commitMessage: `fix: update ${baseDomain} extractor`,
};

createAndPushBranch({ branchName, commitMessage });
createPR({ branchName, title: commitMessage });
}
);
const changeBase = [];
const otherMess = [];

console.log('Updating all fixtures');
for (const filename of fixturesToUpdate) {
const fixturePath = path.join(FIXTURES_PATH, filename);
const baseDomain = filename.replace(/(?:--[a-z-]+)?\.html$/, '');
const oldHtml = await fsPromises.readFile(fixturePath);
const { url } = await Parser.parse(`http://${baseDomain}`, {
html: oldHtml,
});

console.log(`Updating fixture for ${baseDomain} (${url})`);
try {
const { url: updatedUrl } = await Parser.parse(url);

if (!updatedUrl) {
otherMess.push({ updatedUrl, url, filename, baseDomain });
continue;
}

const { hostname } = URL.parse(updatedUrl);

if (hostname !== baseDomain) {
console.log(
`Base URL has changed from ${baseDomain} to ${hostname}, passing`
);

changeBase.push({
filename,
url,
baseDomain,
newBaseDomain: hostname,
updatedUrl,
});
})
.catch(e => {
otherMess.push({ fixture, url, baseDomain, e });
});
});
};

const doTestsPass = site => {
try {
execFileSync('yarn', ['test:node', site]);
return true;
} catch (e) {
return false;
}
};
continue;
}

const promiseSerial = funcs =>
funcs.reduce(
(promise, func) =>
promise.then(result => func().then(Array.prototype.concat.bind(result))),
Promise.resolve([])
);
const $ = await Parser.fetchResource(updatedUrl);
const newHtml = $.html();

const createAndPushBranch = ({ branchName, commitMessage }) => {
execFileSync('git', [
'config',
'user.email',
'[email protected]',
]);
execFileSync('git', ['config', 'user.name', 'Postlight Bot']);
execFileSync('git', ['checkout', '-b', branchName]);
execFileSync('git', ['add', '.']);
execFileSync('git', ['commit', '-m', commitMessage]);
execFileSync('git', [
'push',
'-q',
`https://${process.env.GH_AUTH_TOKEN}@github.com/postlight/parser.git`,
]);
};
await fsPromises.writeFile(fixturePath, newHtml);
} catch (e) {
console.log('Fixture update failed to parse', e);
}
}

const createPR = ({ branchName, title, body = '' }) => {
octokit.authenticate({
type: 'token',
token: process.env.GH_AUTH_TOKEN,
});

octokit.pulls.create({
owner: 'postlight',
repo: 'parser',
title,
head: branchName,
base: 'master',
body,
maintainer_can_modify: true,
});
console.log('changed bases', changeBase);
console.log('other mess', otherMess);
};

perform();