Gf2
cmdcolin committed Dec 12, 2024
1 parent 40a4b14 commit 6a8438d
Showing 11 changed files with 197 additions and 288 deletions.
4 changes: 0 additions & 4 deletions CHANGELOG.md
@@ -1,11 +1,7 @@
## [1.6.1](https://github.com/GMOD/tabix-js/compare/v1.6.0...v1.6.1) (2024-12-07)



# [1.6.0](https://github.com/GMOD/tabix-js/compare/v1.5.15...v1.6.0) (2024-11-30)



## [1.5.15](https://github.com/GMOD/tabix-js/compare/v1.5.14...v1.5.15) (2024-08-30)

## [1.5.14](https://github.com/GMOD/tabix-js/compare/v1.5.13...v1.5.14) (2024-07-23)
9 changes: 3 additions & 6 deletions package.json
@@ -42,8 +42,8 @@
],
"dependencies": {
"@gmod/abortable-promise-cache": "^2.0.0",
"@gmod/bgzf-filehandle": "^1.3.3",
"generic-filehandle": "^3.0.0",
"@gmod/bgzf-filehandle": "^2.0.0",
"generic-filehandle2": "^0.0.1",
"long": "^4.0.0",
"quick-lru": "^4.0.0"
},
@@ -55,16 +55,13 @@
"@typescript-eslint/eslint-plugin": "^8.0.1",
"@typescript-eslint/parser": "^8.0.1",
"@vitest/coverage-v8": "^2.0.5",
"buffer": "^6.0.3",
"documentation": "^14.0.3",
"eslint": "^9.9.0",
"eslint-config-prettier": "^9.1.0",
"eslint-plugin-prettier": "^5.0.1",
"eslint-plugin-unicorn": "^56.0.0",
"prettier": "^3.3.3",
"rimraf": "^6.0.1",
"standard-changelog": "^6.0.0",
"typescript": "~5.6.0",
"typescript": "^5.7.0",
"typescript-eslint": "^8.0.1",
"vitest": "^2.0.5",
"webpack": "^5.93.0",
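A note on the dependency swap above: judging from the call sites later in this diff, the practical difference between generic-filehandle and generic-filehandle2 is the read() signature. The old API filled a caller-allocated Buffer; the new one takes (length, position) and returns the bytes itself. A minimal sketch of the new-style call (the helper name and the Promise<Uint8Array> return type are assumptions based on those call sites, not documented API):

```ts
import { GenericFilehandle, LocalFile } from 'generic-filehandle2'

// Old style (generic-filehandle v3), as in the _readRegion helper removed
// from src/tabixIndexedFile.ts below:
//   const b = Buffer.alloc(size)
//   const { bytesRead, buffer } = await fh.read(b, 0, size, position, opts)
//   return buffer.subarray(0, bytesRead)
//
// New style (generic-filehandle2): the filehandle allocates and returns
// the data, so no Buffer dependency is needed.
async function readRegion(
  fh: GenericFilehandle,
  position: number,
  length: number,
): Promise<Uint8Array> {
  return fh.read(length, position)
}

// Usage sketch (hypothetical file):
// const firstKb = await readRegion(new LocalFile('data.vcf.gz.tbi'), 0, 1024)
```

This also explains the removal of the buffer polyfill from devDependencies: with Uint8Array end to end, nothing in the browser bundle needs Buffer.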
89 changes: 49 additions & 40 deletions src/csi.ts
@@ -1,5 +1,4 @@
import Long from 'long'
import { Buffer } from 'buffer'
import { unzip } from '@gmod/bgzf-filehandle'

import VirtualOffset, { fromBytes } from './virtualOffset'
@@ -11,6 +10,12 @@ import IndexFile, { Options } from './indexFile'
const CSI1_MAGIC = 21582659 // CSI\1
const CSI2_MAGIC = 38359875 // CSI\2

const formats = {
0: 'generic',
1: 'SAM',
2: 'VCF',
}

function lshift(num: number, bits: number) {
return num * 2 ** bits
}
@@ -49,26 +54,27 @@ export default class CSI extends IndexFile {
throw new Error('CSI indexes do not support indexcov')
}

parseAuxData(bytes: Buffer, offset: number) {
const formatFlags = bytes.readInt32LE(offset)
parseAuxData(bytes: Uint8Array, offset: number) {
const dataView = new DataView(bytes.buffer)
const formatFlags = dataView.getInt32(offset, true)
const coordinateType =
formatFlags & 0x10000 ? 'zero-based-half-open' : '1-based-closed'
const format = { 0: 'generic', 1: 'SAM', 2: 'VCF' }[formatFlags & 0xf]
const format = formats[(formatFlags & 0xf) as 0 | 1 | 2]
if (!format) {
throw new Error(`invalid Tabix preset format flags ${formatFlags}`)
}
const columnNumbers = {
ref: bytes.readInt32LE(offset + 4),
start: bytes.readInt32LE(offset + 8),
end: bytes.readInt32LE(offset + 12),
ref: dataView.getInt32(offset + 4, true),
start: dataView.getInt32(offset + 8, true),
end: dataView.getInt32(offset + 12, true),
}
const metaValue = bytes.readInt32LE(offset + 16)
const metaValue = dataView.getInt32(offset + 16, true)
const metaChar = metaValue ? String.fromCharCode(metaValue) : null
const skipLines = bytes.readInt32LE(offset + 20)
const nameSectionLength = bytes.readInt32LE(offset + 24)
const skipLines = dataView.getInt32(offset + 20, true)
const nameSectionLength = dataView.getInt32(offset + 24, true)

const { refIdToName, refNameToId } = this._parseNameBytes(
bytes.slice(offset + 28, offset + 28 + nameSectionLength),
bytes.subarray(offset + 28, offset + 28 + nameSectionLength),
)

return {
@@ -82,47 +88,52 @@
}
}

_parseNameBytes(namesBytes: Buffer) {
_parseNameBytes(namesBytes: Uint8Array) {
let currRefId = 0
let currNameStart = 0
const refIdToName = []
const refNameToId: Record<string, number> = {}
const decoder = new TextDecoder('utf8')
for (let i = 0; i < namesBytes.length; i += 1) {
if (!namesBytes[i]) {
if (currNameStart < i) {
let refName = namesBytes.toString('utf8', currNameStart, i)
refName = this.renameRefSeq(refName)
const refName = this.renameRefSeq(
decoder.decode(namesBytes.subarray(currNameStart, i)),
)
refIdToName[currRefId] = refName
refNameToId[refName] = currRefId
}
currNameStart = i + 1
currRefId += 1
}
}
return { refNameToId, refIdToName }
return {
refNameToId,
refIdToName,
}
}

// fetch and parse the index

async _parse(opts: Options = {}) {
const bytes = await unzip(await this.filehandle.readFile(opts))
const dataView = new DataView(bytes.buffer)

// check TBI magic numbers
let csiVersion
if (bytes.readUInt32LE(0) === CSI1_MAGIC) {
if (dataView.getUint32(0, true) === CSI1_MAGIC) {
csiVersion = 1
} else if (bytes.readUInt32LE(0) === CSI2_MAGIC) {
} else if (dataView.getUint32(0, true) === CSI2_MAGIC) {
csiVersion = 2
} else {
throw new Error('Not a CSI file')
// TODO: do we need to support big-endian CSI files?
}

this.minShift = bytes.readInt32LE(4)
this.depth = bytes.readInt32LE(8)
this.minShift = dataView.getInt32(4, true)
this.depth = dataView.getInt32(8, true)
this.maxBinNumber = ((1 << ((this.depth + 1) * 3)) - 1) / 7
const maxRefLength = 2 ** (this.minShift + this.depth * 3)
const auxLength = bytes.readInt32LE(12)
const auxLength = dataView.getInt32(12, true)
const aux =
auxLength && auxLength >= 30
? this.parseAuxData(bytes, 16)
@@ -134,35 +145,33 @@
coordinateType: 'zero-based-half-open',
format: 'generic',
}
const refCount = bytes.readInt32LE(16 + auxLength)
const refCount = dataView.getInt32(16 + auxLength, true)

// read the indexes for each reference sequence
let firstDataLine: VirtualOffset | undefined
let currOffset = 16 + auxLength + 4
const indices = new Array(refCount).fill(0).map(() => {
// the binning index
const binCount = bytes.readInt32LE(currOffset)
const binCount = dataView.getInt32(currOffset, true)
currOffset += 4
const binIndex: Record<string, Chunk[]> = {}
let stats // < provided by parsing a pseudo-bin, if present
let stats
for (let j = 0; j < binCount; j += 1) {
const bin = bytes.readUInt32LE(currOffset)
const bin = dataView.getUint32(currOffset, true)
if (bin > this.maxBinNumber) {
// this is a fake bin that actually has stats information
// about the reference sequence in it
// this is a fake bin that actually has stats information about the
// reference sequence in it
stats = this.parsePseudoBin(bytes, currOffset + 4)
currOffset += 4 + 8 + 4 + 16 + 16
} else {
const loffset = fromBytes(bytes, currOffset + 4)
firstDataLine = this._findFirstData(firstDataLine, loffset)
const chunkCount = bytes.readInt32LE(currOffset + 12)
const chunkCount = dataView.getInt32(currOffset + 12, true)
currOffset += 16
const chunks = new Array(chunkCount)
for (let k = 0; k < chunkCount; k += 1) {
const u = fromBytes(bytes, currOffset)
const v = fromBytes(bytes, currOffset + 8)
currOffset += 16
// this._findFirstData(data, u)
chunks[k] = new Chunk(u, v, bin)
}
binIndex[bin] = chunks
@@ -186,14 +195,15 @@
}
}

parsePseudoBin(bytes: Buffer, offset: number) {
const lineCount = longToNumber(
Long.fromBytesLE(
bytes.slice(offset + 28, offset + 36) as unknown as number[],
true,
parsePseudoBin(bytes: Uint8Array, offset: number) {
return {
lineCount: longToNumber(
Long.fromBytesLE(
bytes.subarray(offset + 28, offset + 36) as unknown as number[],
true,
),
),
)
return { lineCount }
}
}

async blocksForRange(
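The recurring pattern in this file's changes is mechanical: Buffer.readInt32LE(offset) becomes DataView.getInt32(offset, true), Buffer.slice becomes Uint8Array.subarray (a view, not a copy), and 64-bit counts such as the pseudo-bin lineCount still go through Long. A self-contained sketch of the same pattern (the field layout here is illustrative, not the real CSI layout):

```ts
import Long from 'long'

// Read a little-endian int32 followed by a little-endian uint64 from a
// Uint8Array, mirroring the Buffer -> DataView migration in this commit.
function readFields(bytes: Uint8Array, offset: number) {
  // Passing byteOffset/byteLength keeps this correct even when `bytes` is
  // a subarray view into a larger ArrayBuffer.
  const dataView = new DataView(bytes.buffer, bytes.byteOffset, bytes.byteLength)
  const int32Field = dataView.getInt32(offset, true) // true => little-endian
  const uint64Field = Long.fromBytesLE(
    Array.from(bytes.subarray(offset + 4, offset + 12)), // 8 bytes as number[]
    true, // unsigned
  ).toNumber()
  return { int32Field, uint64Field }
}
```

DataView reads work at any alignment, so they are a drop-in replacement for Buffer's readInt32LE/readUInt32LE without pulling the buffer polyfill into browser builds.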
@@ -216,9 +226,8 @@
return []
}

// const { linearIndex, binIndex } = indexes

const overlappingBins = this.reg2bins(min, max) // List of bin #s that overlap min, max
// List of bin #s that overlap min, max
const overlappingBins = this.reg2bins(min, max)
const chunks: Chunk[] = []

// Find chunks in overlapping bins. Leaf bins (< 4681) are not pruned
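For readers unfamiliar with blocksForRange: reg2bins enumerates the bins of the CSI hierarchical binning scheme that overlap [min, max), parameterized by the minShift and depth values read from the header rather than the fixed constants of BAI/TBI. A sketch of the standard computation, following the htslib reg2bins loop rather than this repo's exact implementation (native JS shifts are 32-bit, which is why csi.ts defines arithmetic lshift/rshift helpers; this sketch uses native shifts for brevity and is only safe below 2^31):

```ts
// Bin numbers at level l start at ((1 << 3*l) - 1) / 7, and each bin at
// that level spans 1 << (minShift + 3 * (depth - l)) bases.
function reg2bins(beg: number, end: number, minShift: number, depth: number) {
  const bins: number[] = []
  end -= 1 // convert half-open [beg, end) to an inclusive end coordinate
  for (
    let l = 0, t = 0, s = minShift + depth * 3;
    l <= depth;
    t += 1 << (l * 3), l += 1, s -= 3
  ) {
    const b = t + (beg >> s)
    const e = t + (end >> s)
    for (let i = b; i <= e; i += 1) {
      bins.push(i)
    }
  }
  return bins
}

// With BAI-like parameters (minShift 14, depth 5), a query covering the
// first base returns one bin per level:
// reg2bins(0, 1, 14, 5) => [0, 1, 9, 73, 585, 4681]
```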
2 changes: 1 addition & 1 deletion src/indexFile.ts
@@ -1,4 +1,4 @@
import { GenericFilehandle } from 'generic-filehandle'
import { GenericFilehandle } from 'generic-filehandle2'
import VirtualOffset from './virtualOffset'
import Chunk from './chunk'

48 changes: 15 additions & 33 deletions src/tabixIndexedFile.ts
@@ -1,7 +1,6 @@
import AbortablePromiseCache from '@gmod/abortable-promise-cache'
import LRU from 'quick-lru'
import { Buffer } from 'buffer'
import { GenericFilehandle, RemoteFile, LocalFile } from 'generic-filehandle'
import { GenericFilehandle, RemoteFile, LocalFile } from 'generic-filehandle2'
import { unzip, unzipChunkSlice } from '@gmod/bgzf-filehandle'
import { checkAbortSignal } from './util'
import IndexFile, { Options, IndexData } from './indexFile'
@@ -17,17 +16,14 @@ function isASCII(str: string) {

type GetLinesCallback = (line: string, fileOffset: number) => void

const decoder =
typeof TextDecoder !== 'undefined' ? new TextDecoder('utf8') : undefined

interface GetLinesOpts {
[key: string]: unknown
signal?: AbortSignal
lineCallback: GetLinesCallback
}

interface ReadChunk {
buffer: Buffer
buffer: Uint8Array
cpositions: number[]
dpositions: number[]
}
@@ -196,6 +192,7 @@ export default class TabixIndexedFile {

const chunks = await this.index.blocksForRange(refName, start, end, options)
checkAbortSignal(signal)
const decoder = new TextDecoder('utf8')

// now go through each chunk and parse and filter the lines out of it
for (const c of chunks) {
@@ -209,11 +206,11 @@
let blockStart = 0
let pos = 0

const str = decoder?.decode(buffer) ?? buffer.toString()
// fast path, Buffer is just ASCII chars and not gigantor, can be
// converted to string and processed directly. if it is not ASCII or
// gigantic (chrome max str len is 512Mb), we have to decode line by line
const strIsASCII = buffer.length < 500_000_000 && isASCII(str)
const str = decoder.decode(buffer)
const strIsASCII = isASCII(str)
while (blockStart < str.length) {
let line: string
let n: number
@@ -224,12 +221,12 @@
}
line = str.slice(blockStart, n)
} else {
n = buffer.indexOf('\n', blockStart)
n = buffer.indexOf('\n'.charCodeAt(0), blockStart)
if (n === -1) {
break
}
const b = buffer.slice(blockStart, n)
line = decoder?.decode(b) ?? b.toString()
line = decoder.decode(b)
}

// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
@@ -292,10 +289,10 @@
checkAbortSignal(opts.signal)

const maxFetch = (firstDataLine?.blockPosition || 0) + maxBlockSize
// TODO: what if we don't have a firstDataLine, and the header
// actually takes up more than one block? this case is not covered here
// TODO: what if we don't have a firstDataLine, and the header actually
// takes up more than one block? this case is not covered here

const buf = await this._readRegion(0, maxFetch, opts)
const buf = await this.filehandle.read(maxFetch, 0, opts)
const bytes = await unzip(buf)

// trim off lines after the last non-meta line
@@ -324,8 +321,9 @@
* @returns {Promise} for a string
*/
async getHeader(opts: Options = {}) {
const decoder = new TextDecoder('utf8')
const bytes = await this.getHeaderBuffer(opts)
return bytes.toString('utf8')
return decoder.decode(bytes)
}

/**
Expand Down Expand Up @@ -492,32 +490,16 @@ export default class TabixIndexedFile {
return this.index.lineCount(refName, opts)
}

async _readRegion(pos: number, size: number, opts: Options = {}) {
const b = Buffer.alloc(size)
const { bytesRead, buffer } = await this.filehandle.read(
b,
0,
size,
pos,
opts,
)

return buffer.subarray(0, bytesRead)
}

/**
* read and uncompress the data in a chunk (composed of one or more
* contiguous bgzip blocks) of the file
*/
async readChunk(c: Chunk, opts: Options = {}) {
// fetch the uncompressed data, uncompress carefully a block at a time, and
// stop when done

const data = await this._readRegion(
c.minv.blockPosition,
const ret = await this.filehandle.read(
c.fetchedSize(),
c.minv.blockPosition,
opts,
)
return unzipChunkSlice(data, c)
return unzipChunkSlice(ret, c)
}
}
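Restating the decoding strategy from the getLines changes above as a standalone sketch: decode the whole uncompressed block once, and if the result is pure ASCII, string indices equal byte offsets, so lines can be sliced straight out of the decoded string; otherwise find each newline in the raw bytes and decode line by line. This is a simplification (the real getLines also tracks cpositions/dpositions to report virtual file offsets, which is omitted here), and the helper names are illustrative:

```ts
function isASCII(str: string) {
  // eslint-disable-next-line no-control-regex
  return /^[\u0000-\u007f]*$/.test(str)
}

// Yield [line, offsetOfLineStart] pairs from an uncompressed block.
function* eachLine(buffer: Uint8Array): Generator<[string, number]> {
  const decoder = new TextDecoder('utf8')
  const str = decoder.decode(buffer)
  const ascii = isASCII(str) // ASCII => string index === byte offset
  const newline = '\n'.charCodeAt(0)
  const limit = ascii ? str.length : buffer.length
  let blockStart = 0
  while (blockStart < limit) {
    let line: string
    let n: number
    if (ascii) {
      // fast path: slice lines directly out of the decoded string
      n = str.indexOf('\n', blockStart)
      if (n === -1) break
      line = str.slice(blockStart, n)
    } else {
      // slow path: locate the newline byte, decode just that one line
      n = buffer.indexOf(newline, blockStart)
      if (n === -1) break
      line = decoder.decode(buffer.subarray(blockStart, n))
    }
    yield [line, blockStart]
    blockStart = n + 1
  }
}
```

Note that the pre-migration code additionally gated the fast path on buffer.length < 500_000_000 (engine maximum string length); this commit drops that guard, as visible in the diff above.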