Gf2
cmdcolin committed Dec 12, 2024
1 parent 40a4b14 commit 6a8438d
Showing 11 changed files with 197 additions and 288 deletions.
4 changes: 0 additions & 4 deletions CHANGELOG.md
@@ -1,11 +1,7 @@
## [1.6.1](https://github.com/GMOD/tabix-js/compare/v1.6.0...v1.6.1) (2024-12-07)



# [1.6.0](https://github.com/GMOD/tabix-js/compare/v1.5.15...v1.6.0) (2024-11-30)



## [1.5.15](https://github.com/GMOD/tabix-js/compare/v1.5.14...v1.5.15) (2024-08-30)

## [1.5.14](https://github.com/GMOD/tabix-js/compare/v1.5.13...v1.5.14) (2024-07-23)
9 changes: 3 additions & 6 deletions package.json
@@ -42,8 +42,8 @@
],
"dependencies": {
"@gmod/abortable-promise-cache": "^2.0.0",
"@gmod/bgzf-filehandle": "^1.3.3",
"generic-filehandle": "^3.0.0",
"@gmod/bgzf-filehandle": "^2.0.0",
"generic-filehandle2": "^0.0.1",
"long": "^4.0.0",
"quick-lru": "^4.0.0"
},
@@ -55,16 +55,13 @@
"@typescript-eslint/eslint-plugin": "^8.0.1",
"@typescript-eslint/parser": "^8.0.1",
"@vitest/coverage-v8": "^2.0.5",
"buffer": "^6.0.3",
"documentation": "^14.0.3",
"eslint": "^9.9.0",
"eslint-config-prettier": "^9.1.0",
"eslint-plugin-prettier": "^5.0.1",
"eslint-plugin-unicorn": "^56.0.0",
"prettier": "^3.3.3",
"rimraf": "^6.0.1",
"standard-changelog": "^6.0.0",
"typescript": "~5.6.0",
"typescript": "^5.7.0",
"typescript-eslint": "^8.0.1",
"vitest": "^2.0.5",
"webpack": "^5.93.0",
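A note on the dependency swap above: judging from the call sites later in this diff, the practical difference between generic-filehandle and generic-filehandle2 is the read() signature. The old API filled a caller-allocated Buffer; the new one takes (length, position) and returns the bytes itself. A minimal sketch of the new-style call (the helper name and the Promise<Uint8Array> return type are assumptions based on those call sites, not documented API):

```ts
import { GenericFilehandle, LocalFile } from 'generic-filehandle2'

// Old style (generic-filehandle v3), as in the _readRegion helper removed
// from src/tabixIndexedFile.ts below:
//   const b = Buffer.alloc(size)
//   const { bytesRead, buffer } = await fh.read(b, 0, size, position, opts)
//   return buffer.subarray(0, bytesRead)
//
// New style (generic-filehandle2): the filehandle allocates and returns
// the data, so no Buffer dependency is needed.
async function readRegion(
  fh: GenericFilehandle,
  position: number,
  length: number,
): Promise<Uint8Array> {
  return fh.read(length, position)
}

// Usage sketch (hypothetical file):
// const firstKb = await readRegion(new LocalFile('data.vcf.gz.tbi'), 0, 1024)
```

This also explains the removal of the buffer polyfill from devDependencies: with Uint8Array end to end, nothing in the browser bundle needs Buffer.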
89 changes: 49 additions & 40 deletions src/csi.ts
@@ -1,5 +1,4 @@
import Long from 'long'
import { Buffer } from 'buffer'
import { unzip } from '@gmod/bgzf-filehandle'

import VirtualOffset, { fromBytes } from './virtualOffset'
@@ -11,6 +10,12 @@ import IndexFile, { Options } from './indexFile'
const CSI1_MAGIC = 21582659 // CSI\1
const CSI2_MAGIC = 38359875 // CSI\2

const formats = {
0: 'generic',
1: 'SAM',
2: 'VCF',
}

function lshift(num: number, bits: number) {
return num * 2 ** bits
}
@@ -49,26 +54,27 @@ export default class CSI extends IndexFile {
throw new Error('CSI indexes do not support indexcov')
}

parseAuxData(bytes: Buffer, offset: number) {
const formatFlags = bytes.readInt32LE(offset)
parseAuxData(bytes: Uint8Array, offset: number) {
const dataView = new DataView(bytes.buffer)
const formatFlags = dataView.getInt32(offset, true)
const coordinateType =
formatFlags & 0x10000 ? 'zero-based-half-open' : '1-based-closed'
const format = { 0: 'generic', 1: 'SAM', 2: 'VCF' }[formatFlags & 0xf]
const format = formats[(formatFlags & 0xf) as 0 | 1 | 2]
if (!format) {
throw new Error(`invalid Tabix preset format flags ${formatFlags}`)
}
const columnNumbers = {
ref: bytes.readInt32LE(offset + 4),
start: bytes.readInt32LE(offset + 8),
end: bytes.readInt32LE(offset + 12),
ref: dataView.getInt32(offset + 4, true),
start: dataView.getInt32(offset + 8, true),
end: dataView.getInt32(offset + 12, true),
}
const metaValue = bytes.readInt32LE(offset + 16)
const metaValue = dataView.getInt32(offset + 16, true)
const metaChar = metaValue ? String.fromCharCode(metaValue) : null
const skipLines = bytes.readInt32LE(offset + 20)
const nameSectionLength = bytes.readInt32LE(offset + 24)
const skipLines = dataView.getInt32(offset + 20, true)
const nameSectionLength = dataView.getInt32(offset + 24, true)

const { refIdToName, refNameToId } = this._parseNameBytes(
bytes.slice(offset + 28, offset + 28 + nameSectionLength),
bytes.subarray(offset + 28, offset + 28 + nameSectionLength),
)

return {
@@ -82,47 +88,52 @@
}
}

_parseNameBytes(namesBytes: Buffer) {
_parseNameBytes(namesBytes: Uint8Array) {
let currRefId = 0
let currNameStart = 0
const refIdToName = []
const refNameToId: Record<string, number> = {}
const decoder = new TextDecoder('utf8')
for (let i = 0; i < namesBytes.length; i += 1) {
if (!namesBytes[i]) {
if (currNameStart < i) {
let refName = namesBytes.toString('utf8', currNameStart, i)
refName = this.renameRefSeq(refName)
const refName = this.renameRefSeq(
decoder.decode(namesBytes.subarray(currNameStart, i)),
)
refIdToName[currRefId] = refName
refNameToId[refName] = currRefId
}
currNameStart = i + 1
currRefId += 1
}
}
return { refNameToId, refIdToName }
return {
refNameToId,
refIdToName,
}
}

// fetch and parse the index

async _parse(opts: Options = {}) {
const bytes = await unzip(await this.filehandle.readFile(opts))
const dataView = new DataView(bytes.buffer)

// check TBI magic numbers
let csiVersion
if (bytes.readUInt32LE(0) === CSI1_MAGIC) {
if (dataView.getUint32(0, true) === CSI1_MAGIC) {
csiVersion = 1
} else if (bytes.readUInt32LE(0) === CSI2_MAGIC) {
} else if (dataView.getUint32(0, true) === CSI2_MAGIC) {
csiVersion = 2
} else {
throw new Error('Not a CSI file')
// TODO: do we need to support big-endian CSI files?
}

this.minShift = bytes.readInt32LE(4)
this.depth = bytes.readInt32LE(8)
this.minShift = dataView.getInt32(4, true)
this.depth = dataView.getInt32(8, true)
this.maxBinNumber = ((1 << ((this.depth + 1) * 3)) - 1) / 7
const maxRefLength = 2 ** (this.minShift + this.depth * 3)
const auxLength = bytes.readInt32LE(12)
const auxLength = dataView.getInt32(12, true)
const aux =
auxLength && auxLength >= 30
? this.parseAuxData(bytes, 16)
@@ -134,35 +145,33 @@
coordinateType: 'zero-based-half-open',
format: 'generic',
}
const refCount = bytes.readInt32LE(16 + auxLength)
const refCount = dataView.getInt32(16 + auxLength, true)

// read the indexes for each reference sequence
let firstDataLine: VirtualOffset | undefined
let currOffset = 16 + auxLength + 4
const indices = new Array(refCount).fill(0).map(() => {
// the binning index
const binCount = bytes.readInt32LE(currOffset)
const binCount = dataView.getInt32(currOffset, true)
currOffset += 4
const binIndex: Record<string, Chunk[]> = {}
let stats // < provided by parsing a pseudo-bin, if present
let stats
for (let j = 0; j < binCount; j += 1) {
const bin = bytes.readUInt32LE(currOffset)
const bin = dataView.getUint32(currOffset, true)
if (bin > this.maxBinNumber) {
// this is a fake bin that actually has stats information
// about the reference sequence in it
// this is a fake bin that actually has stats information about the
// reference sequence in it
stats = this.parsePseudoBin(bytes, currOffset + 4)
currOffset += 4 + 8 + 4 + 16 + 16
} else {
const loffset = fromBytes(bytes, currOffset + 4)
firstDataLine = this._findFirstData(firstDataLine, loffset)
const chunkCount = bytes.readInt32LE(currOffset + 12)
const chunkCount = dataView.getInt32(currOffset + 12, true)
currOffset += 16
const chunks = new Array(chunkCount)
for (let k = 0; k < chunkCount; k += 1) {
const u = fromBytes(bytes, currOffset)
const v = fromBytes(bytes, currOffset + 8)
currOffset += 16
// this._findFirstData(data, u)
chunks[k] = new Chunk(u, v, bin)
}
binIndex[bin] = chunks
@@ -186,14 +195,15 @@
}
}

parsePseudoBin(bytes: Buffer, offset: number) {
const lineCount = longToNumber(
Long.fromBytesLE(
bytes.slice(offset + 28, offset + 36) as unknown as number[],
true,
parsePseudoBin(bytes: Uint8Array, offset: number) {
return {
lineCount: longToNumber(
Long.fromBytesLE(
bytes.subarray(offset + 28, offset + 36) as unknown as number[],
true,
),
),
)
return { lineCount }
}
}

async blocksForRange(
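The recurring pattern in this file's changes is mechanical: Buffer.readInt32LE(offset) becomes DataView.getInt32(offset, true), Buffer.slice becomes Uint8Array.subarray (a view, not a copy), and 64-bit counts such as the pseudo-bin lineCount still go through Long. A self-contained sketch of the same pattern (the field layout here is illustrative, not the real CSI layout):

```ts
import Long from 'long'

// Read a little-endian int32 followed by a little-endian uint64 from a
// Uint8Array, mirroring the Buffer -> DataView migration in this commit.
function readFields(bytes: Uint8Array, offset: number) {
  // Passing byteOffset/byteLength keeps this correct even when `bytes` is
  // a subarray view into a larger ArrayBuffer.
  const dataView = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
  const int32Field = dataView.getInt32(offset, true) // true => little-endian
  const uint64Field = Long.fromBytesLE(
    Array.from(bytes.subarray(offset + 4, offset + 12)), // 8 bytes as number[]
    true, // unsigned
  ).toNumber()
  return { int32Field, uint64Field }
}
```

DataView reads work at any alignment, so they are a drop-in replacement for Buffer's readInt32LE/readUInt32LE without pulling the buffer polyfill into browser builds.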
@@ -216,9 +226,8 @@
return []
}

// const { linearIndex, binIndex } = indexes

const overlappingBins = this.reg2bins(min, max) // List of bin #s that overlap min, max
// List of bin #s that overlap min, max
const overlappingBins = this.reg2bins(min, max)
const chunks: Chunk[] = []

// Find chunks in overlapping bins. Leaf bins (< 4681) are not pruned
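For readers unfamiliar with blocksForRange: reg2bins enumerates the bins of the CSI hierarchical binning scheme that overlap [min, max), parameterized by the minShift and depth values read from the header rather than the fixed constants of BAI/TBI. A sketch of the standard computation, following the htslib reg2bins loop rather than this repo's exact implementation (native JS shifts are 32-bit, which is why csi.ts defines arithmetic lshift/rshift helpers; this sketch uses native shifts for brevity and is only safe below 2^31):

```ts
// Bin numbers at level l start at ((1 << 3*l) - 1) / 7, and each bin at
// that level spans 1 << (minShift + 3 * (depth - l)) bases.
function reg2bins(beg: number, end: number, minShift: number, depth: number) {
  const bins: number[] = []
  end -= 1 // convert half-open [beg, end) to an inclusive end coordinate
  for (
    let l = 0, t = 0, s = minShift + depth * 3;
    l <= depth;
    t += 1 << (l * 3), l += 1, s -= 3
  ) {
    const b = t + (beg >> s)
    const e = t + (end >> s)
    for (let i = b; i <= e; i += 1) {
      bins.push(i)
    }
  }
  return bins
}

// With BAI-like parameters (minShift 14, depth 5), a query covering the
// first base returns one bin per level:
// reg2bins(0, 1, 14, 5) => [0, 1, 9, 73, 585, 4681]
```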
2 changes: 1 addition & 1 deletion src/indexFile.ts
@@ -1,4 +1,4 @@
import { GenericFilehandle } from 'generic-filehandle'
import { GenericFilehandle } from 'generic-filehandle2'
import VirtualOffset from './virtualOffset'
import Chunk from './chunk'

48 changes: 15 additions & 33 deletions src/tabixIndexedFile.ts
@@ -1,7 +1,6 @@
import AbortablePromiseCache from '@gmod/abortable-promise-cache'
import LRU from 'quick-lru'
import { Buffer } from 'buffer'
import { GenericFilehandle, RemoteFile, LocalFile } from 'generic-filehandle'
import { GenericFilehandle, RemoteFile, LocalFile } from 'generic-filehandle2'
import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle'
import { checkAbortSignal } from './util'
import IndexFile, { Options, IndexData } from './indexFile'
@@ -17,17 +16,14 @@ function isASCII(str: string) {

type GetLinesCallback = (line: string, fileOffset: number) => void

const decoder =
typeof TextDecoder !== 'undefined' ? new TextDecoder('utf8') : undefined

interface GetLinesOpts {
[key: string]: unknown
signal?: AbortSignal
lineCallback: GetLinesCallback
}

interface ReadChunk {
buffer: Buffer
buffer: Uint8Array
cpositions: number[]
dpositions: number[]
}
@@ -196,6 +192,7 @@ export default class TabixIndexedFile {

const chunks = await this.index.blocksForRange(refName, start, end, options)
checkAbortSignal(signal)
const decoder = new TextDecoder('utf8')

// now go through each chunk and parse and filter the lines out of it
for (const c of chunks) {
@@ -209,11 +206,11 @@
let blockStart = 0
let pos = 0

const str = decoder?.decode(buffer) ?? buffer.toString()
// fast path, Buffer is just ASCII chars and not gigantor, can be
// converted to string and processed directly. if it is not ASCII or
// gigantic (chrome max str len is 512Mb), we have to decode line by line
const strIsASCII = buffer.length < 500_000_000 && isASCII(str)
const str = decoder.decode(buffer)
const strIsASCII = isASCII(str)
while (blockStart < str.length) {
let line: string
let n: number
@@ -224,12 +221,12 @@
}
line = str.slice(blockStart, n)
} else {
n = buffer.indexOf('\n', blockStart)
n = buffer.indexOf('\n'.charCodeAt(0), blockStart)
if (n === -1) {
break
}
const b = buffer.slice(blockStart, n)
line = decoder?.decode(b) ?? b.toString()
line = decoder.decode(b)
}

// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
@@ -292,10 +289,10 @@
checkAbortSignal(opts.signal)

const maxFetch = (firstDataLine?.blockPosition || 0) + maxBlockSize
// TODO: what if we don't have a firstDataLine, and the header
// actually takes up more than one block? this case is not covered here
// TODO: what if we don't have a firstDataLine, and the header actually
// takes up more than one block? this case is not covered here

const buf = await this._readRegion(0, maxFetch, opts)
const buf = await this.filehandle.read(maxFetch, 0, opts)
const bytes = await unzip(buf)

// trim off lines after the last non-meta line
@@ -324,8 +321,9 @@
* @returns {Promise} for a string
*/
async getHeader(opts: Options = {}) {
const decoder = new TextDecoder('utf8')
const bytes = await this.getHeaderBuffer(opts)
return bytes.toString('utf8')
return decoder.decode(bytes)
}

/**
Expand Down Expand Up @@ -492,32 +490,16 @@ export default class TabixIndexedFile {
return this.index.lineCount(refName, opts)
}

async _readRegion(pos: number, size: number, opts: Options = {}) {
const b = Buffer.alloc(size)
const { bytesRead, buffer } = await this.filehandle.read(
b,
0,
size,
pos,
opts,
)

return buffer.subarray(0, bytesRead)
}

/**
* read and uncompress the data in a chunk (composed of one or more
* contiguous bgzip blocks) of the file
*/
async readChunk(c: Chunk, opts: Options = {}) {
// fetch the uncompressed data, uncompress carefully a block at a time, and
// stop when done

const data = await this._readRegion(
c.minv.blockPosition,
const ret = await this.filehandle.read(
c.fetchedSize(),
c.minv.blockPosition,
opts,
)
return unzipChunkSlice(data, c)
return unzipChunkSlice(ret, c)
}
}
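Restating the decoding strategy from the getLines changes above as a standalone sketch: decode the whole uncompressed block once, and if the result is pure ASCII, string indices equal byte offsets, so lines can be sliced straight out of the decoded string; otherwise find each newline in the raw bytes and decode line by line. This is a simplification (the real getLines also tracks cpositions/dpositions to report virtual file offsets, which is omitted here), and the helper names are illustrative:

```ts
function isASCII(str: string) {
  // eslint-disable-next-line no-control-regex
  return /^[\u0000-\u007f]*$/.test(str)
}

// Yield [line, offsetOfLineStart] pairs from an uncompressed block.
function* eachLine(buffer: Uint8Array): Generator<[string, number]> {
  const decoder = new TextDecoder('utf8')
  const str = decoder.decode(buffer)
  const ascii = isASCII(str) // ASCII => string index === byte offset
  const newline = '\n'.charCodeAt(0)
  const limit = ascii ? str.length : buffer.length
  let blockStart = 0
  while (blockStart < limit) {
    let line: string
    let n: number
    if (ascii) {
      // fast path: slice lines directly out of the decoded string
      n = str.indexOf('\n', blockStart)
      if (n === -1) break
      line = str.slice(blockStart, n)
    } else {
      // slow path: locate the newline byte, decode just that one line
      n = buffer.indexOf(newline, blockStart)
      if (n === -1) break
      line = decoder.decode(buffer.subarray(blockStart, n))
    }
    yield [line, blockStart]
    blockStart = n + 1
  }
}
```

Note that the pre-migration code additionally gated the fast path on buffer.length < 500_000_000 (engine maximum string length); this commit drops that guard, as visible in the diff above.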