Skip to content

Commit

Permalink
feat: Add custom parser - channelnewsasia.com (#44)
Browse files Browse the repository at this point in the history
  • Loading branch information
jocmp authored Jan 19, 2025
1 parent 6abdf5a commit 953d2d9
Show file tree
Hide file tree
Showing 5 changed files with 6,466 additions and 0 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Mercury Parser Changelog

### 2.3.1

- [ba949c0c87] - feat: Add custom parser - channelnewsasia.com (Josiah Campbell) [#44](https://github.com/jocmp/mercury-parser/pull/44)
- [6abdf5a862] - chore: Fix references to Postlight Parser in README (Josiah Campbell)

### 2.3.0 (Jan 15, 2025)

- [a45b329e0a] - fix: Update versants.com to parse figures (Josiah Campbell) [#42](https://github.com/jocmp/mercury-parser/pull/42)
Expand Down
6,338 changes: 6,338 additions & 0 deletions fixtures/www.channelnewsasia.com/1737324313613.html

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/extractors/custom/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,4 @@ export * from './techcrunch.com';
export * from './www.hardwarezone.com.sg';
export * from './www.spiegel.de';
export * from './mobilesyrup.com';
export * from './www.channelnewsasia.com';
36 changes: 36 additions & 0 deletions src/extractors/custom/www.channelnewsasia.com/index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/**
 * Custom extractor configuration for articles on www.channelnewsasia.com.
 *
 * Each field lists its candidate selectors in order. An entry written as a
 * `[selector, attribute]` pair presumably reads that attribute from the
 * matched element rather than its text — confirm against the extractor
 * framework's documentation.
 */
export const WwwChannelnewsasiaComExtractor = {
  domain: 'www.channelnewsasia.com',

  title: {
    selectors: [['meta[name="og:title"]', 'value']],
  },

  author: {
    // Prefer the visible byline link; fall back to the cXense author meta tag.
    selectors: [
      '.link--author-profile',
      ['meta[name="cXenseParse:author"]', 'value'],
    ],
  },

  date_published: {
    selectors: ['.article-publish:not(span)'],
    // Matches timestamps shaped like "17 Jan 2025 09:06PM". The hour uses the
    // 12-hour token `hh` because it is followed by the meridiem token `a`;
    // the 24-hour token `HH` contradicts an am/pm suffix.
    format: 'DD MMM YYYY hh:mma',
    // The page timestamp carries no UTC offset, so the zone is fixed here.
    timezone: 'Asia/Singapore',
  },

  dek: {
    selectors: ['.content-detail__description'],
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  content: {
    selectors: ['section[data-title="Content"]'],

    // No node transforms are configured for this site.
    transforms: {},

    // No extra selectors are stripped from the extracted content.
    clean: [],
  },
};
86 changes: 86 additions & 0 deletions src/extractors/custom/www.channelnewsasia.com/index.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import assert from 'assert';
import URL from 'url';
import cheerio from 'cheerio';

import Parser from 'mercury';
import getExtractor from 'extractors/get-extractor';
import { excerptContent } from 'utils/text';

const fs = require('fs');

// Runs the channelnewsasia.com extractor against a saved article fixture and
// checks each extracted field.
describe('WwwChannelnewsasiaComExtractor', () => {
  describe('initial test case', () => {
    let articleUrl;
    let parsed;

    beforeAll(() => {
      articleUrl =
        'https://www.channelnewsasia.com/singapore/police-arrest-suspects-ica-change-addresses-unauthorised-attempts-4869916';

      const fixturePath =
        './fixtures/www.channelnewsasia.com/1737324313613.html';
      const html = fs.readFileSync(fixturePath);

      // The parse result is stored un-awaited; each test awaits it itself.
      // NOTE(review): `fallback: false` presumably limits extraction to the
      // custom extractor's own selectors — confirm against Parser.parse.
      parsed = Parser.parse(articleUrl, { html, fallback: false });
    });

    it('is selected properly', () => {
      const { domain } = getExtractor(articleUrl);
      const { hostname } = URL.parse(articleUrl);

      assert.equal(domain, hostname);
    });

    it('returns the title', async () => {
      const article = await parsed;

      assert.equal(
        article.title,
        'Another six people arrested for unauthorised address changes using ICA online service'
      );
    });

    it('returns the author', async () => {
      const article = await parsed;

      assert.equal(article.author, 'Daphne Yow');
    });

    it('returns the date_published', async () => {
      const article = await parsed;

      assert.equal(article.date_published, '2025-01-17T13:06:00.000Z');
    });

    it('returns the dek', async () => {
      const article = await parsed;

      assert.equal(
        article.dek,
        'This brings the total number of people arrested so far to 13.'
      );
    });

    it('returns the lead_image_url', async () => {
      const article = await parsed;

      assert.equal(
        article.lead_image_url,
        'https://dam.mediacorp.sg/image/upload/s--WYyV_F7F--/c_crop,h_574,w_1021,x_4,y_1/f_auto,q_auto/c_fill,g_auto,h_676,w_1200/v1/mediacorp/cna/image/2022/11/17/ica3.jpg?itok=smLA3pam'
      );
    });

    it('returns the content', async () => {
      const article = await parsed;

      // An empty string stands in for missing content so cheerio still loads.
      const $ = cheerio.load(article.content || '');
      const firstNodeText = $('*')
        .first()
        .text();

      // Compare only the first 13 words of the first matched element's text.
      const first13 = excerptContent(firstNodeText, 13);

      assert.equal(
        first13,
        'SINGAPORE: The police have arrested another six people in relation to a series'
      );
    });
  });
});

0 comments on commit 953d2d9

Please sign in to comment.