Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Small optimizations for large GWAS type data #148

Merged
merged 4 commits into from
Nov 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
14 changes: 10 additions & 4 deletions eslint.config.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,17 @@ export default tseslint.config(
},
],

'no-underscore-dangle': 0,
'no-console': [
'warn',
{
allow: ['error', 'warn'],
},
],
'no-underscore-dangle': 'off',
curly: 'error',
'@typescript-eslint/no-explicit-any': 0,
'@typescript-eslint/explicit-module-boundary-types': 0,
'@typescript-eslint/ban-ts-comment': 0,
'@typescript-eslint/no-explicit-any': 'off',
'@typescript-eslint/explicit-module-boundary-types': 'off',
'@typescript-eslint/ban-ts-comment': 'off',
semi: ['error', 'never'],
'unicorn/no-new-array': 'off',
'unicorn/no-empty-file': 'off',
Expand Down
6 changes: 3 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@
"docs": "documentation readme --shallow src/tabixIndexedFile.ts --section TabixIndexedFile",
"clean": "rimraf dist esm",
"prebuild": "npm run clean && npm run lint",
"build:esm": "tsc --target es2018 --outDir esm",
"build:es5": "tsc --target es2015 --module commonjs --outDir dist",
"build:esm": "tsc --outDir esm",
"build:es5": "tsc --module commonjs --outDir dist",
"build": "npm run build:esm && npm run build:es5",
"postbuild": "webpack",
"preversion": "npm run lint && npm test run && npm run build",
Expand Down Expand Up @@ -60,7 +60,7 @@
"eslint": "^9.9.0",
"eslint-config-prettier": "^9.1.0",
"eslint-plugin-prettier": "^5.0.1",
"eslint-plugin-unicorn": "^55.0.0",
"eslint-plugin-unicorn": "^56.0.0",
"prettier": "^3.3.3",
"rimraf": "^6.0.1",
"standard-changelog": "^6.0.0",
Expand Down
122 changes: 64 additions & 58 deletions src/tabixIndexedFile.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@
import TBI from './tbi'
import CSI from './csi'

function isASCII(str: string) {
return /^[\u0000-\u007F]*$/.test(str)

Check failure on line 14 in src/tabixIndexedFile.ts

View workflow job for this annotation

GitHub Actions / Lint, build, and test on node 20.x and ubuntu-latest

Unexpected control character(s) in regular expression: \x00
}

type GetLinesCallback = (line: string, fileOffset: number) => void

const decoder =
Expand All @@ -27,13 +31,9 @@
dpositions: number[]
}

function timeout(time: number) {
return new Promise(resolve => setTimeout(resolve, time))
}
export default class TabixIndexedFile {
private filehandle: GenericFilehandle
private index: IndexFile
private yieldTime: number
private renameRefSeq: (n: string) => string
private chunkCache: AbortablePromiseCache<Chunk, ReadChunk>

Expand All @@ -58,9 +58,6 @@
*
* @param {tbiUrl} [args.tbiUrl]
*
* @param {number} [args.yieldTime] yield to main thread after N milliseconds
* if reading features is taking a long time to avoid hanging main thread
*
* @param {function} [args.renameRefSeqs] optional function with sig `string
* => string` to transform reference sequence names for the purpose of
* indexing and querying. note that the data that is returned is not altered,
Expand All @@ -76,7 +73,6 @@
csiPath,
csiUrl,
csiFilehandle,
yieldTime = 500,
renameRefSeqs = n => n,
chunkCacheSize = 5 * 2 ** 20,
}: {
Expand All @@ -89,7 +85,6 @@
csiPath?: string
csiUrl?: string
csiFilehandle?: GenericFilehandle
yieldTime?: number
renameRefSeqs?: (n: string) => string
chunkCacheSize?: number
}) {
Expand Down Expand Up @@ -147,7 +142,6 @@
}

this.renameRefSeq = renameRefSeqs
this.yieldTime = yieldTime
this.chunkCache = new AbortablePromiseCache<Chunk, ReadChunk>({
cache: new LRU({ maxSize: Math.floor(chunkCacheSize / (1 << 16)) }),
fill: (args: Chunk, signal?: AbortSignal) =>
Expand Down Expand Up @@ -203,9 +197,7 @@
checkAbortSignal(signal)

// now go through each chunk and parse and filter the lines out of it
let last = Date.now()
for (const c of chunks) {
let previousStartCoordinate: number | undefined
const { buffer, cpositions, dpositions } = await this.chunkCache.get(
c.toString(),
c,
Expand All @@ -215,13 +207,29 @@
checkAbortSignal(signal)
let blockStart = 0
let pos = 0
while (blockStart < buffer.length) {
const n = buffer.indexOf('\n', blockStart)
if (n === -1) {
break

const str = decoder?.decode(buffer) ?? buffer.toString()
// fast path, Buffer is just ASCII chars and not gigantor, can be
// converted to string and processed directly. if it is not ASCII or
// gigantic (chrome max str len is 512Mb), we have to decode line by line
const strIsASCII = buffer.length < 500_000_000 && isASCII(str)
while (blockStart < str.length) {
let line: string
let n: number
if (strIsASCII) {
n = str.indexOf('\n', blockStart)
if (n === -1) {
break
}
line = str.slice(blockStart, n)
} else {
n = buffer.indexOf('\n', blockStart)
if (n === -1) {
break
}
const b = buffer.slice(blockStart, n)
line = decoder?.decode(b) ?? b.toString()
}
const b = buffer.slice(blockStart, n)
const line = decoder?.decode(b) ?? b.toString()

// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
if (dpositions) {
Expand All @@ -238,48 +246,31 @@
line,
)

// do a small check just to make sure that the lines are really sorted
// by start coordinate
if (
previousStartCoordinate !== undefined &&
startCoordinate !== undefined &&
previousStartCoordinate > startCoordinate
) {
throw new Error(
`Lines not sorted by start coordinate (${previousStartCoordinate} > ${startCoordinate}), this file is not usable with Tabix.`,
)
}
previousStartCoordinate = startCoordinate

if (overlaps) {
callback(
line.trim(),
// cpositions[pos] refers to actual file offset of a bgzip block boundaries
line,
// cpositions[pos] refers to actual file offset of a bgzip block
// boundaries
//
// we multiply by (1 <<8) in order to make sure each block has a "unique"
// address space so that data in that block could never overlap
// we multiply by (1 <<8) in order to make sure each block has a
// "unique" address space so that data in that block could never
// overlap
//
// then the blockStart-dpositions is an uncompressed file offset from
// that bgzip block boundary, and since the cpositions are multiplied by
// (1 << 8) these uncompressed offsets get a unique space
// then the blockStart-dpositions is an uncompressed file offset
// from that bgzip block boundary, and since the cpositions are
// multiplied by (1 << 8) these uncompressed offsets get a unique
// space
cpositions[pos]! * (1 << 8) +
(blockStart - dpositions[pos]!) +
c.minv.dataPosition +
1,
)
} else if (startCoordinate !== undefined && startCoordinate >= end) {
// the lines were overlapping the region, but now have stopped, so
// we must be at the end of the relevant data and we can stop
// processing data now
// the lines were overlapping the region, but now have stopped, so we
// must be at the end of the relevant data and we can stop processing
// data now
return
}

// yield if we have emitted beyond the yield limit
if (this.yieldTime && last - Date.now() > this.yieldTime) {
last = Date.now()
checkAbortSignal(signal)
await timeout(1)
}
blockStart = n + 1
}
}
Expand All @@ -296,6 +287,7 @@
async getHeaderBuffer(opts: Options = {}) {
const { firstDataLine, metaChar, maxBlockSize } =
await this.getMetadata(opts)

checkAbortSignal(opts.signal)

// eslint-disable-next-line @typescript-eslint/restrict-plus-operands
Expand All @@ -320,7 +312,7 @@
lastNewline = i
}
}
return bytes.slice(0, lastNewline + 1)
return bytes.subarray(0, lastNewline + 1)
}
return bytes
}
Expand Down Expand Up @@ -397,14 +389,17 @@
let currentColumnStart = 0
let refSeq = ''
let startCoordinate = -Infinity
for (let i = 0; i < line.length + 1; i += 1) {
if (line[i] === '\t' || i === line.length) {
const l = line.length
for (let i = 0; i < l + 1; i++) {
if (line[i] === '\t' || i === l) {
if (currentColumnNumber === ref) {
if (
this.renameRefSeq(line.slice(currentColumnStart, i)) !==
regionRefName
) {
return { overlaps: false }
return {
overlaps: false,
}
}
} else if (currentColumnNumber === start) {
startCoordinate = parseInt(line.slice(currentColumnStart, i), 10)
Expand All @@ -413,12 +408,18 @@
startCoordinate -= 1
}
if (startCoordinate >= regionEnd) {
return { startCoordinate, overlaps: false }
return {
startCoordinate,
overlaps: false,
}
}
if (end === 0 || end === start) {
// if we have no end, we assume the feature is 1 bp long
if (startCoordinate + 1 <= regionStart) {
return { startCoordinate, overlaps: false }
return {
startCoordinate,
overlaps: false,
}
}
}
} else if (format === 'VCF' && currentColumnNumber === 4) {
Expand All @@ -432,9 +433,11 @@
refSeq,
line.slice(currentColumnStart, i),
)
: parseInt(line.slice(currentColumnStart, i), 10)
: Number.parseInt(line.slice(currentColumnStart, i), 10)
if (endCoordinate <= regionStart) {
return { overlaps: false }
return {
overlaps: false,
}
}
}
currentColumnStart = i + 1
Expand All @@ -444,7 +447,10 @@
}
}
}
return { startCoordinate, overlaps: true }
return {
startCoordinate,
overlaps: true,
}
}

_getVcfEnd(startCoordinate: number, refSeq: string, info: any) {
Expand Down Expand Up @@ -496,7 +502,7 @@
opts,
)

return buffer.slice(0, bytesRead)
return buffer.subarray(0, bytesRead)
}

/**
Expand Down
2 changes: 1 addition & 1 deletion tsconfig.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"compilerOptions": {
"moduleResolution": "node",
"lib": ["es2017", "es7", "es6", "dom"],
"target": "es2018",
"declaration": true,
"noUncheckedIndexedAccess": true,
"outDir": "dist",
Expand Down
Loading