Skip to content

Commit

Permalink
Improve deserialisation performance
Browse files Browse the repository at this point in the history
Use the lower-level array-construction primitives to avoid intermediate
allocations and perform much better at deserialisation.

On Cabal (which uses tar for the hackage index), we observed:
* Deserialisation of IntTries go from 1.5s to 200ms, with 10GB of allocations going down to roughly 600MB.
* StringTable deserialization go from 700ms to 50ms, with 4GB of allocations going down to 80MB.
  • Loading branch information
alt-romes committed Jun 11, 2024
1 parent 3c20909 commit ad4162b
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 50 deletions.
22 changes: 10 additions & 12 deletions Codec/Archive/Tar/Index/IntTrie.hs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
{-# LANGUAGE CPP, BangPatterns, PatternGuards #-}
{-# LANGUAGE DeriveDataTypeable, ScopedTypeVariables #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE UnboxedTuples #-}
{-# OPTIONS_HADDOCK hide #-}

module Codec.Archive.Tar.Index.IntTrie (
Expand Down Expand Up @@ -55,6 +57,10 @@ import Data.IntMap.Strict (IntMap)

import Data.List hiding (lookup, insert)
import Data.Function (on)
import GHC.IO

import Data.ByteString.Internal (accursedUnutterablePerformIO)
import Codec.Archive.Tar.Index.Utils

-- | A compact mapping from sequences of nats to nats.
--
Expand Down Expand Up @@ -338,19 +344,11 @@ deserialise bs
, let lenArr = readWord32BE bs 0
lenTotal = 4 + 4 * fromIntegral lenArr
, BS.length bs >= 4 + 4 * fromIntegral lenArr
, let !arr = A.array (0, lenArr-1)
[ (i, readWord32BE bs off)
| (i, off) <- zip [0..lenArr-1] [4,8 .. lenTotal - 4] ]
!bs' = BS.drop lenTotal bs
= Just (IntTrie arr, bs')
, let !bs_without_len = BS.unsafeDrop 4 bs
!bs_remaining = BS.unsafeDrop lenTotal bs
!arr = accursedUnutterablePerformIO $ beToLe lenArr bs_without_len
= Just (IntTrie arr, bs_remaining)

| otherwise
= Nothing

readWord32BE :: BS.ByteString -> Int -> Word32
readWord32BE bs i =
assert (i >= 0 && i+3 <= BS.length bs - 1) $
fromIntegral (BS.unsafeIndex bs (i + 0)) `shiftL` 24
+ fromIntegral (BS.unsafeIndex bs (i + 1)) `shiftL` 16
+ fromIntegral (BS.unsafeIndex bs (i + 2)) `shiftL` 8
+ fromIntegral (BS.unsafeIndex bs (i + 3))
13 changes: 3 additions & 10 deletions Codec/Archive/Tar/Index/Internal.hs
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ import Codec.Archive.Tar.Read as Tar
import qualified Codec.Archive.Tar.Index.StringTable as StringTable
import Codec.Archive.Tar.Index.StringTable (StringTable, StringTableBuilder)
import qualified Codec.Archive.Tar.Index.IntTrie as IntTrie
import Codec.Archive.Tar.Index.Utils (readWord32BE)
import Codec.Archive.Tar.Index.IntTrie (IntTrie, IntTrieBuilder)
import Codec.Archive.Tar.PackAscii

Expand Down Expand Up @@ -497,27 +498,19 @@ deserialise bs
| let ver = readWord32BE bs 0
, ver == 1
= do let !finalOffset = readWord32BE bs 4
(stringTable, bs') <- StringTable.deserialiseV1 (BS.drop 8 bs)
(stringTable, bs') <- StringTable.deserialiseV1 (BS.unsafeDrop 8 bs)
(intTrie, bs'') <- IntTrie.deserialise bs'
return (TarIndex stringTable intTrie finalOffset, bs'')

| let ver = readWord32BE bs 0
, ver == 2
= do let !finalOffset = readWord32BE bs 4
(stringTable, bs') <- StringTable.deserialiseV2 (BS.drop 8 bs)
(stringTable, bs') <- StringTable.deserialiseV2 (BS.unsafeDrop 8 bs)
(intTrie, bs'') <- IntTrie.deserialise bs'
return (TarIndex stringTable intTrie finalOffset, bs'')

| otherwise = Nothing

readWord32BE :: BS.ByteString -> Int -> Word32
readWord32BE bs i =
assert (i >= 0 && i+3 <= BS.length bs - 1) $
fromIntegral (BS.unsafeIndex bs (i + 0)) `shiftL` 24
+ fromIntegral (BS.unsafeIndex bs (i + 1)) `shiftL` 16
+ fromIntegral (BS.unsafeIndex bs (i + 2)) `shiftL` 8
+ fromIntegral (BS.unsafeIndex bs (i + 3))

toStrict :: LBS.ByteString -> BS.ByteString
toStrict = LBS.toStrict

Expand Down
45 changes: 17 additions & 28 deletions Codec/Archive/Tar/Index/StringTable.hs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ import qualified Data.ByteString.Lazy as LBS
import Data.ByteString.Builder as BS
import Data.ByteString.Builder.Extra as BS (byteStringCopy)

import Data.ByteString.Internal (accursedUnutterablePerformIO)
import Unsafe.Coerce (unsafeCoerce)
import Codec.Archive.Tar.Index.Utils

-- | An efficient mapping from strings to a dense set of integers.
--
data StringTable id = StringTable
Expand Down Expand Up @@ -172,7 +176,7 @@ deserialiseV1 bs
lenArr = fromIntegral (readWord32BE bs 4)
lenTotal= 8 + lenStrs + 4 * lenArr
, BS.length bs >= lenTotal
, let strs = BS.take lenStrs (BS.drop 8 bs)
, let strs = BS.unsafeTake lenStrs (BS.unsafeDrop 8 bs)
arr = A.array (0, fromIntegral lenArr - 1)
[ (i, readWord32BE bs off)
| (i, off) <- zip [0 .. fromIntegral lenArr - 1]
Expand Down Expand Up @@ -200,35 +204,20 @@ deserialiseV2 bs
+ 4 * lenArr
+(4 * (lenArr - 1)) * 2 -- offsets array is 1 longer
, BS.length bs >= lenTotal
, let strs = BS.take lenStrs (BS.drop 8 bs)
offs = A.listArray (0, fromIntegral lenArr - 1)
[ readWord32BE bs off
| off <- offsets offsOff ]
, let strs = BS.unsafeTake lenStrs (BS.unsafeDrop 8 bs)
offs_bs = BS.unsafeDrop (8 + lenStrs) bs
ids_bs = BS.unsafeDrop (lenArr * 4) offs_bs
ixs_bs = BS.unsafeDrop ((lenArr - 1) * 4) ids_bs

offs = accursedUnutterablePerformIO $ beToLe (fromIntegral lenArr) offs_bs
-- the second two arrays are 1 shorter
ids = A.listArray (0, fromIntegral lenArr - 2)
[ readInt32BE bs off
| off <- offsets idsOff ]
ixs = A.listArray (0, fromIntegral lenArr - 2)
[ readInt32BE bs off
| off <- offsets ixsOff ]
offsOff = 8 + lenStrs
idsOff = offsOff + 4 * lenArr
ixsOff = idsOff + 4 * (lenArr-1)
offsets from = [from,from+4 .. from + 4 * (lenArr - 1)]
!stringTable = StringTable strs offs ids ixs
!bs' = BS.drop lenTotal bs
= Just (stringTable, bs')
ids = accursedUnutterablePerformIO $ beToLe (fromIntegral lenArr - 1) ids_bs
ixs = accursedUnutterablePerformIO $ beToLe (fromIntegral lenArr - 1) ixs_bs

!stringTable = StringTable strs (unsafeCoerce offs) (unsafeCoerce ids) (unsafeCoerce ixs)
!bs_left = BS.drop lenTotal bs
= Just (stringTable, bs_left)

| otherwise
= Nothing

readInt32BE :: BS.ByteString -> Int -> Int32
readInt32BE bs i = fromIntegral (readWord32BE bs i)

readWord32BE :: BS.ByteString -> Int -> Word32
readWord32BE bs i =
assert (i >= 0 && i+3 <= BS.length bs - 1) $
fromIntegral (BS.unsafeIndex bs (i + 0)) `shiftL` 24
+ fromIntegral (BS.unsafeIndex bs (i + 1)) `shiftL` 16
+ fromIntegral (BS.unsafeIndex bs (i + 2)) `shiftL` 8
+ fromIntegral (BS.unsafeIndex bs (i + 3))
39 changes: 39 additions & 0 deletions Codec/Archive/Tar/Index/Utils.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
module Codec.Archive.Tar.Index.Utils where

import Data.ByteString as BS
import Control.Exception (assert)

import Data.ByteString.Internal (ByteString(..), unsafeWithForeignPtr, accursedUnutterablePerformIO)
import GHC.Int (Int(..), Int32)
import GHC.Word (Word32(..), byteSwap32)
import Foreign.Storable (peek)
import GHC.Ptr (castPtr, plusPtr, Ptr)
import Data.Array.Base
import Data.Array.IO.Internals (unsafeFreezeIOUArray)

-- | Construct a `UArray Word32 Word32` from a ByteString of 32bit big endian
-- words.
beToLe :: Word32
-- ^ The total array length (the number of 32bit words in the array)
-> BS.ByteString
-- ^ The bytestring from which the UArray is constructed.
-- The content must start in the first byte! (i.e. the meta-data words
-- that shouldn't be part of the array must have been dropped already)
-> IO (UArray Word32 Word32)
beToLe lenArr (BS fptr _) = unsafeWithForeignPtr fptr $ \ptr -> do
let ptr' = castPtr ptr :: Ptr Word32
unsafeFreezeIOUArray =<<
newGenArray (0, lenArr - 1) (\offset -> do

Check failure on line 26 in Codec/Archive/Tar/Index/Utils.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

Variable not in scope:

Check failure on line 26 in Codec/Archive/Tar/Index/Utils.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

Variable not in scope:

Check failure on line 26 in Codec/Archive/Tar/Index/Utils.hs

View workflow job for this annotation

GitHub Actions / build (windows-latest, latest)

Variable not in scope:

Check failure on line 26 in Codec/Archive/Tar/Index/Utils.hs

View workflow job for this annotation

GitHub Actions / build (macOS-latest, latest)

Variable not in scope:

Check failure on line 26 in Codec/Archive/Tar/Index/Utils.hs

View workflow job for this annotation

GitHub Actions / build (macOS-latest, latest)

Variable not in scope:

Check failure on line 26 in Codec/Archive/Tar/Index/Utils.hs

View workflow job for this annotation

GitHub Actions / build (macOS-latest, latest)

Variable not in scope:
byteSwap32 <$> peek (ptr' `plusPtr` (fromIntegral offset * 4)))

readInt32BE :: BS.ByteString -> Int -> Int32
readInt32BE bs i = fromIntegral (readWord32BE bs i)
{-# INLINE readInt32BE #-}

readWord32BE :: BS.ByteString -> Int -> Word32
readWord32BE (BS fptr len) i =
assert (i >= 0 && i+3 <= len - 1) $
accursedUnutterablePerformIO $
unsafeWithForeignPtr fptr $ \ptr -> do
byteSwap32 <$> peek (castPtr (ptr `plusPtr` i))
{-# INLINE readWord32BE #-}
1 change: 1 addition & 0 deletions tar.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ library tar-internal
Codec.Archive.Tar.Index.StringTable
Codec.Archive.Tar.Index.IntTrie
Codec.Archive.Tar.Index.Internal
Codec.Archive.Tar.Index.Utils

other-extensions:
BangPatterns
Expand Down

0 comments on commit ad4162b

Please sign in to comment.