Skip to content

Commit

Permalink
Script to concatenate MD files (#471)
Browse files Browse the repository at this point in the history
* Script to concatenate MD files

* Remove front matter and replace with title H1

* Filter out tutorials

* Filter out glossary

* Generate file and store on S3

* Change file name to allPageSourceFiles.md

* Faster performance by not processing every line

* Convert to ES6

* Process to HTML with Remark

* Strip markdown instead of processing to HTML

* This is a text file now

* Fix underscore escaping

* Send tutorials to separate file
  • Loading branch information
timothymcmackin authored Nov 27, 2024
1 parent c2cd472 commit ca332a0
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/deploy-production.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ jobs:
node-version: 18
- run: npm ci
- run: npm run build
- run: npm run concat
- run: mv allPageSourceFiles.txt allTutorials.txt ./build/
- uses: aws-actions/configure-aws-credentials@v2
with:
aws-access-key-id: ${{ secrets.AWS_S3_PRODUCTION_KEY_ID }}
Expand Down
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@ node_modules
out
build
_glossaryBuild
allPageSourceFiles.txt
allTutorials.txt
31 changes: 31 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"docusaurus": "docusaurus",
"start": "docusaurus start",
"build": "node ./src/scripts/process_downloaded_glossary.js && docusaurus build",
"concat": "node ./src/scripts/concatenate.mjs",
"swizzle": "docusaurus swizzle",
"deploy": "docusaurus deploy",
"clear": "docusaurus clear",
Expand Down Expand Up @@ -41,8 +42,10 @@
"minimist": "1.2.8",
"mocha": "10.2.0",
"rehype-stringify": "10.0.0",
"remark": "^15.0.1",
"remark-parse": "11.0.0",
"remark-rehype": "11.0.0",
"strip-markdown": "^6.0.0",
"unified": "11.0.4",
"unist-util-visit": "5.0.0"
},
Expand Down
134 changes: 134 additions & 0 deletions src/scripts/concatenate.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import path from 'path';
import fs from 'fs';

import {remark} from 'remark'
import strip from 'strip-markdown';

import { dirname } from 'node:path';
import { fileURLToPath } from 'node:url';

const __dirname = dirname(fileURLToPath(import.meta.url));

import sidebars from '../../sidebars.js';
// Map sidebars to output file names
// Keys are sidebar names as exported by sidebars.js; values are the names of
// the concatenated text files, which concatSidebar resolves two folders above
// this script's folder. Sidebars without an entry here are not concatenated.
const fileNames = {
documentationSidebar: 'allPageSourceFiles.txt',
tutorialsSidebar: 'allTutorials.txt',
}
// Sidebar doc IDs that are left out of the concatenated output
const pathsToFilterOut = [
'overview/glossary',
];

/**
 * Given a docusaurus sidebar object, return a list of the local doc IDs in it.
 *
 * @param {string|Array|object|null|undefined} sidebarObject - A doc ID string,
 *   an array of sidebar items, or a single sidebar item object with a `type`.
 * @returns {string|Array|null|undefined} For an array or a category, a flat
 *   array of IDs that may contain `null`/`undefined` placeholders for items
 *   without a local doc (callers are expected to filter those out). For a
 *   single non-category item, the ID itself, or `null`/`undefined` when the
 *   item has no local doc.
 */
function getIdsRecursive(sidebarObject) {
  if (typeof sidebarObject === 'string') {
    return sidebarObject;
  }
  // Missing entries (for example a category without `items`) have no IDs
  if (sidebarObject == null) {
    return null;
  }
  // Array.isArray also works for values whose `constructor` is missing or altered
  if (Array.isArray(sidebarObject)) {
    return sidebarObject.reduce(
      (list, oneSidebarObj) => list.concat(getIdsRecursive(oneSidebarObj)),
      [],
    );
  }
  switch (sidebarObject.type) {
    case 'category':
      // The category's own landing page (if any) comes before its children
      return [sidebarObject.link?.id].concat(getIdsRecursive(sidebarObject.items));
    case 'doc':
      return sidebarObject.id;
    case 'link':
      // External links have no local source file
      if (sidebarObject.href && sidebarObject.href.startsWith('http')) {
        return null;
      }
      return sidebarObject.href;
    default:
      return null;
  }
}

/**
 * Given a doc file ID from the sidebar, get the filename path.
 * Looks for `<fileId>.md` first and then `<fileId>.mdx` in the docs folder.
 *
 * @param {string} fileId - Doc ID as it appears in the sidebar.
 * @returns {Promise<string|undefined>} Absolute path of the first file that
 *   exists, or `undefined` (after logging an error) when neither exists.
 */
async function getFilePath(fileId) {
  const basePath = path.resolve(__dirname, '../../docs', fileId);
  for (const extension of ['.md', '.mdx']) {
    const candidatePath = basePath + extension;
    try {
      await fs.promises.access(candidatePath, fs.constants.F_OK);
      return candidatePath;
    } catch {
      // Not accessible under this extension; try the next one
    }
  }
  console.error('Could not find file with sidebar ID', fileId);
  return undefined;
}

/**
 * Remove the front matter from an MD file and replace with an H1.
 * Got to remove FM because multiple FM blocks break some markdown tools.
 * Could do this with gray-matter but I don't want to add the dependency.
 *
 * Front matter is only recognized when the very first line is `---` and a
 * closing `---` line follows; otherwise the text is returned unchanged, so a
 * `---` horizontal rule in the body is never mistaken for front matter.
 *
 * @param {string} mdText - Full text of a markdown file (LF or CRLF endings).
 * @returns {string} The title as an H1 (empty line if the front matter has no
 *   `title:` field), a blank line, and the body, joined with `\n`; or `mdText`
 *   itself when there is no front matter.
 */
function removeFrontMatter(mdText) {
  // Accept both LF and CRLF so the `---` fences match on any platform
  const lines = mdText.split(/\r?\n/);
  if (lines[0] !== '---') {
    return mdText;
  }
  const closingIndex = lines.indexOf('---', 1);
  if (closingIndex === -1) {
    // Unterminated front matter: leave the text alone rather than lose it
    return mdText;
  }

  const titleRegex = /^title:\s+(.*)$/;
  let titleLine = '';
  for (const line of lines.slice(1, closingIndex)) {
    const match = titleRegex.exec(line);
    if (match) {
      titleLine = `# ${match[1]}`;
    }
  }

  const bodyLines = lines.slice(closingIndex + 1);
  // A blank separator is added below, so drop the one that usually follows the
  // front matter; a non-blank first line is content and must be kept
  if (bodyLines.length > 0 && bodyLines[0].trim() === '') {
    bodyLines.shift();
  }
  return [titleLine, '', ...bodyLines].join('\n');
}

/**
 * Concatenate the source of every local doc in one sidebar into a single
 * plain-text file, in sidebar (TOC) order.
 *
 * @param {string} sidebarName - Key of the sidebar in `sidebars`; must also be
 *   a key of `fileNames`, which supplies the output file name.
 * @returns {Promise<void>} Resolves once the output file has been written.
 */
async function concatSidebar(sidebarName) {
  const outputPath = path.resolve(__dirname, '../../', fileNames[sidebarName]);

  // Remove old concatenated file if it exists; `force` makes a missing file a
  // no-op while other failures (such as permissions) are still reported
  await fs.promises.rm(outputPath, { force: true });

  const allMdIds = getIdsRecursive(sidebars[sidebarName])
    .filter((id) => id && !pathsToFilterOut.includes(id));

  // Find the matching file paths, skipping IDs with no source file
  // (getFilePath has already logged those)
  const resolvedPaths = await Promise.all(allMdIds.map((id) => getFilePath(id)));
  const allFilePaths = resolvedPaths.filter((oneFilePath) => oneFilePath);

  // Read and concat the files in TOC order; appends must stay sequential
  for (const oneFilePath of allFilePaths) {
    const rawText = await fs.promises.readFile(oneFilePath, 'utf8');
    // Fall back to the raw text if front matter removal yields nothing
    const markdownText = removeFrontMatter(rawText) ?? rawText;
    const oneFileText = await remark()
      .use(strip)
      .process(markdownText);
    // Fix strip plugin escaping `_` as `\_`
    const oneFileTextFixEscaped = String(oneFileText).replaceAll('\\_', '_');
    await fs.promises.appendFile(outputPath, `${oneFileTextFixEscaped}\n\n`);
  }

  console.log(`Wrote concatenated file for sidebar ${sidebarName} to ${outputPath}`);
}

/**
 * Write one concatenated file for each sidebar that has an entry in
 * `fileNames`. The sidebars are processed concurrently.
 *
 * @returns {Promise<void>} Resolves when every selected sidebar is written.
 */
async function concatSidebars() {
  const pendingWrites = [];
  for (const oneSidebarName of Object.keys(sidebars)) {
    // Only sidebars with a configured output file name are concatenated
    const hasOutputFile = Object.keys(fileNames).includes(oneSidebarName);
    if (hasOutputFile) {
      pendingWrites.push(concatSidebar(oneSidebarName));
    }
  }
  await Promise.all(pendingWrites);
}

// Run the script. Handle a rejection explicitly instead of leaving the promise
// floating: log the error and exit non-zero so the calling build step fails.
concatSidebars().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});

0 comments on commit ca332a0

Please sign in to comment.