Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid per-byte loop in cstring{,Utf8} builders #569

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
21 changes: 14 additions & 7 deletions Data/ByteString/Builder.hs
Original file line number Diff line number Diff line change
Expand Up @@ -263,12 +263,14 @@ import qualified Data.ByteString.Builder.Prim as P
import qualified Data.ByteString.Lazy.Internal as L
import Data.ByteString.Builder.ASCII
import Data.ByteString.Builder.RealFloat
import Data.ByteString.Internal (byteCountLiteral)

import Data.String (IsString(..))
import System.IO (Handle, IOMode(..), withBinaryFile)
import Foreign
import GHC.Base (unpackCString#, unpackCStringUtf8#,
unpackFoldrCString#, build)
import GHC.Ptr (Ptr(..))

-- | Execute a 'Builder' and return the generated chunks as a lazy 'L.ByteString'.
-- The work is performed lazy, i.e., only when a chunk of the lazy 'L.ByteString'
Expand Down Expand Up @@ -440,18 +442,20 @@ char8 :: Char -> Builder
char8 = P.primFixed P.char8

-- | Char8 encode a 'String'.
{-# INLINE [1] string8 #-} -- phased to allow P.cstring rewrite
{-# NOINLINE string8 #-}
string8 :: String -> Builder
string8 = P.primMapListFixed P.char8

-- GHC desugars string literals with unpackCString# which the simplifier tends
-- to promptly turn into build (unpackFoldrCString# s), so we match on both.
{-# RULES
"string8/unpackCString#" forall s.
string8 (unpackCString# s) = P.cstring s
string8 (unpackCString# s) =
asciiLiteralCopy (Ptr s) (byteCountLiteral s)

"string8/unpackFoldrCString#" forall s.
string8 (build (unpackFoldrCString# s)) = P.cstring s
string8 (build (unpackFoldrCString# s)) =
asciiLiteralCopy (Ptr s) (byteCountLiteral s)
#-}

------------------------------------------------------------------------------
Expand All @@ -467,19 +471,22 @@ charUtf8 = P.primBounded P.charUtf8
--
-- Note that 'stringUtf8' performs no codepoint validation and consequently may
-- emit invalid UTF-8 if asked (e.g. single surrogates).
{-# INLINE [1] stringUtf8 #-} -- phased to allow P.cstring rewrite
{-# NOINLINE stringUtf8 #-}
stringUtf8 :: String -> Builder
stringUtf8 = P.primMapListBounded P.charUtf8

{-# RULES
"stringUtf8/unpackCStringUtf8#" forall s.
stringUtf8 (unpackCStringUtf8# s) = P.cstringUtf8 s
stringUtf8 (unpackCStringUtf8# s) =
modUtf8LitCopy (Ptr s) (byteCountLiteral s)

"stringUtf8/unpackCString#" forall s.
stringUtf8 (unpackCString# s) = P.cstring s
stringUtf8 (unpackCString# s) =
asciiLiteralCopy (Ptr s) (byteCountLiteral s)

"stringUtf8/unpackFoldrCString#" forall s.
stringUtf8 (build (unpackFoldrCString# s)) = P.cstring s
stringUtf8 (build (unpackFoldrCString# s)) =
asciiLiteralCopy (Ptr s) (byteCountLiteral s)
#-}

instance IsString Builder where
Expand Down
76 changes: 75 additions & 1 deletion Data/ByteString/Builder/Internal.hs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{-# LANGUAGE ScopedTypeVariables, CPP, BangPatterns, RankNTypes, TupleSections #-}
{-# LANGUAGE Unsafe #-}
{-# LANGUAGE MagicHash, ViewPatterns, Unsafe #-}
{-# OPTIONS_HADDOCK not-home #-}
-- | Copyright : (c) 2010 - 2011 Simon Meier
-- License : BSD3-style (see LICENSE)
Expand Down Expand Up @@ -84,6 +84,8 @@ module Data.ByteString.Builder.Internal (
-- , sizedChunksInsert

, byteStringCopy
, asciiLiteralCopy
, modUtf8LitCopy
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
, modUtf8LitCopy
, modUtf8LiteralCopy

For consistency with asciiLiteralCopy (or we might as well chose to use Lit for both)

, byteStringInsert
, byteStringThreshold

Expand Down Expand Up @@ -127,6 +129,7 @@ module Data.ByteString.Builder.Internal (
) where

import Control.Arrow (second)
import Control.Monad (when)

import Data.Semigroup (Semigroup(..))

Expand All @@ -138,10 +141,12 @@ import qualified Data.ByteString.Short.Internal as Sh
import qualified GHC.IO.Buffer as IO (Buffer(..), newByteBuffer)
import GHC.IO.Handle.Internals (wantWritableHandle, flushWriteBuffer)
import GHC.IO.Handle.Types (Handle__, haByteBuffer, haBufferMode)
import GHC.Ptr (Ptr(..))
import System.IO (hFlush, BufferMode(..), Handle)
import Data.IORef

import Foreign
import Foreign.C.String (CString)
import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
import System.IO.Unsafe (unsafeDupablePerformIO)

Expand Down Expand Up @@ -874,6 +879,75 @@ byteStringInsert :: S.ByteString -> Builder
byteStringInsert =
\bs -> builder $ \k (BufferRange op _) -> return $ insertChunk op bs k


------------------------------------------------------------------------------
-- Raw CString encoding
------------------------------------------------------------------------------

-- | Builder for raw 'Addr#' pointers to null-terminated primitive ASCII
-- strings that are free of embedded (overlong-encoded as the two-byte sequence
-- @0xC0 0x80@) null characters.
--
-- @since 0.11.5.0
clyring marked this conversation as resolved.
Show resolved Hide resolved
clyring marked this conversation as resolved.
Show resolved Hide resolved
{-# INLINABLE asciiLiteralCopy #-}
asciiLiteralCopy :: Ptr Word8 -> Int -> Builder
asciiLiteralCopy = \ !ip !len -> builder $ \k br -> do
let !ipe = ip `plusPtr` len
wrappedBytesCopyStep (BufferRange ip ipe) k br

-- | GHC represents @NUL@ in string literals via an overlong 2-byte encoding,
-- which is part of "modified UTF-8" (GHC does not also implement CESU-8).
modifiedUtf8NUL :: CString
modifiedUtf8NUL = Ptr "\xc0\x80"#
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
modifiedUtf8NUL = Ptr "\xc0\x80"#
modUtf8NUL = Ptr "\xc0\x80"#

Let's keep the prefix consistent.


-- | Builder for raw 'Addr#' pointers to null-terminated primitive UTF-8
-- encoded strings that may contain embedded overlong-encodings (as the
-- two-byte sequence @0xC0 0x80@) of null characters.
--
-- @since 0.11.5.0
{-# INLINABLE modUtf8LitCopy #-}
modUtf8LitCopy :: Ptr Word8 -> Int -> Builder
modUtf8LitCopy = \ !ip !len -> builder $ \k br -> do
nullAt <- c_strstr (castPtr ip) modifiedUtf8NUL
modUtf8_step ip len nullAt k br

modUtf8_step :: Ptr Word8 -> Int -> Ptr Word8 -> BuildStep r -> BuildStep r
modUtf8_step !ip !len ((== nullPtr) -> True) k br =
-- Contains no encoded nulls, use simple copy codepath
wrappedBytesCopyStep (BufferRange ip ipe) k br
where
!ipe = ip `plusPtr` len
modUtf8_step !ip !len !nullAt k (BufferRange op0 ope)
-- Copy as much of the null-free portion of the string as fits into the
-- available buffer space. If the string is long enough, we may have asked
-- for less than its full length, filling the buffer with the rest will go
-- into the next builder step.
| avail > nullFree = do
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please check with hpc that tests provide sufficient coverage of all cases here? (Sorry, I'm AFK and cannot check myself)

when (nullFree > 0) (copyBytes op0 ip nullFree)
clyring marked this conversation as resolved.
Show resolved Hide resolved
pokeElemOff op0 nullFree 0
let used = nullFree + 2
len' = len - used
!ip' = ip `plusPtr` used
!op' = op0 `plusPtr` (nullFree + 1)
nullAt' <- c_strstr ip' modifiedUtf8NUL
modUtf8_step ip' len' nullAt' k (BufferRange op' ope)
| avail > 0 = do
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same question, but also avail == 0 should be a very rare case.

-- avail <= nullFree
copyBytes op0 ip avail
let len' = len - avail
!ip' = ip `plusPtr` avail
!op' = op0 `plusPtr` avail
return $ bufferFull 1 op' (modUtf8_step ip' len' nullAt k)
| otherwise =
return $ bufferFull 1 op0 (modUtf8_step ip len nullAt k)
where
!avail = ope `minusPtr` op0
!nullFree = nullAt `minusPtr` ip

foreign import ccall unsafe "string.h strstr" c_strstr
:: CString -> CString -> IO (Ptr Word8)


-- Short bytestrings
------------------------------------------------------------------------------

Expand Down
58 changes: 12 additions & 46 deletions Data/ByteString/Builder/Prim.hs
Original file line number Diff line number Diff line change
Expand Up @@ -663,59 +663,25 @@ primMapLazyByteStringBounded w =
L.foldrChunks (\x b -> primMapByteStringBounded w x `mappend` b) mempty


------------------------------------------------------------------------------
-- Raw CString encoding
------------------------------------------------------------------------------

-- | A null-terminated ASCII encoded 'Foreign.C.String.CString'.
-- Null characters are not representable.
-- | Builder for raw 'Addr#' pointers to null-terminated primitive ASCII
-- strings that are free of embedded (overlong-encoded as the two-byte sequence
-- @0xC0 0x80@) null characters.
--
-- @since 0.11.0.0
{-# DEPRECATED cstring "Use asciiLiteralCopy instead" #-}
cstring :: Addr# -> Builder
cstring =
\addr0 -> builder $ step addr0
where
step :: Addr# -> BuildStep r -> BuildStep r
step !addr !k br@(BufferRange op0@(Ptr op0#) ope)
| W8# ch == 0 = k br
| op0 == ope =
return $ bufferFull 1 op0 (step addr k)
| otherwise = do
IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
step (addr `plusAddr#` 1#) k br'
where
!ch = indexWord8OffAddr# addr 0#
cstring s = asciiLiteralCopy (Ptr s) (S.byteCountLiteral s)
{-# INLINE cstring #-}

-- | A null-terminated UTF-8 encoded 'Foreign.C.String.CString'.
-- Null characters can be encoded as @0xc0 0x80@.
-- | Builder for raw 'Addr#' pointers to null-terminated primitive UTF-8
-- encoded strings that may contain embedded overlong-encodings (as the
-- two-byte sequence @0xC0 0x80@) of null characters.
--
-- @since 0.11.0.0
{-# DEPRECATED cstringUtf8 "Use modUtf8LitCopy instead" #-}
cstringUtf8 :: Addr# -> Builder
cstringUtf8 =
\addr0 -> builder $ step addr0
where
step :: Addr# -> BuildStep r -> BuildStep r
step !addr !k br@(BufferRange op0@(Ptr op0#) ope)
| W8# ch == 0 = k br
| op0 == ope =
return $ bufferFull 1 op0 (step addr k)
-- NULL is encoded as 0xc0 0x80
| W8# ch == 0xc0
, W8# (indexWord8OffAddr# addr 1#) == 0x80 = do
let !(W8# nullByte#) = 0
IO $ \s -> case writeWord8OffAddr# op0# 0# nullByte# s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
step (addr `plusAddr#` 2#) k br'
| otherwise = do
IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
step (addr `plusAddr#` 1#) k br'
where
!ch = indexWord8OffAddr# addr 0#
cstringUtf8 s = modUtf8LitCopy (Ptr s) (S.byteCountLiteral s)
{-# INLINE cstringUtf8 #-}

------------------------------------------------------------------------------
-- Char8 encoding
Expand Down
2 changes: 1 addition & 1 deletion Data/ByteString/Internal.hs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ module Data.ByteString.Internal (
unpackBytes, unpackAppendBytesLazy, unpackAppendBytesStrict,
unpackChars, unpackAppendCharsLazy, unpackAppendCharsStrict,
unsafePackAddress, unsafePackLenAddress,
unsafePackLiteral, unsafePackLenLiteral,
unsafePackLiteral, unsafePackLenLiteral, byteCountLiteral,

-- * Low level imperative construction
empty,
Expand Down
19 changes: 14 additions & 5 deletions Data/ByteString/Internal/Type.hs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ module Data.ByteString.Internal.Type (
unpackBytes, unpackAppendBytesLazy, unpackAppendBytesStrict,
unpackChars, unpackAppendCharsLazy, unpackAppendCharsStrict,
unsafePackAddress, unsafePackLenAddress,
unsafePackLiteral, unsafePackLenLiteral,
unsafePackLiteral, unsafePackLenLiteral, byteCountLiteral,

-- * Low level imperative construction
empty,
Expand Down Expand Up @@ -481,13 +481,22 @@ unsafePackLenAddress len addr# = do
-- @since 0.11.1.0
unsafePackLiteral :: Addr# -> ByteString
unsafePackLiteral addr# =
unsafePackLenLiteral (byteCountLiteral addr#) addr#
{-# INLINE unsafePackLiteral #-}

-- | Byte count of null-terminated primitive literal string excluding the
-- terminating null byte.
byteCountLiteral :: Addr# -> Int
byteCountLiteral addr# =
#if __GLASGOW_HASKELL__ >= 811
unsafePackLenLiteral (I# (cstringLength# addr#)) addr#
I# (cstringLength# addr#)
#else
let len = accursedUnutterablePerformIO (c_strlen (Ptr addr#))
in unsafePackLenLiteral (fromIntegral len) addr#
fromIntegral (pure_strlen (Ptr addr#))

foreign import ccall unsafe "string.h strlen" pure_strlen
:: CString -> CSize
#endif
{-# INLINE unsafePackLiteral #-}
{-# INLINE byteCountLiteral #-}


-- | See 'unsafePackLiteral'. This function is similar,
Expand Down
26 changes: 24 additions & 2 deletions bench/BenchAll.hs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import qualified Data.ByteString as S
import qualified Data.ByteString.Char8 as S8
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Char8 as L8
import Data.ByteString.Internal (byteCountLiteral)

import Data.ByteString.Builder
import Data.ByteString.Builder.Extra (byteStringCopy,
Expand All @@ -33,10 +34,13 @@ import Data.ByteString.Builder.Extra (byteStringCopy,
import Data.ByteString.Builder.Internal (ensureFree)
import Data.ByteString.Builder.Prim (BoundedPrim, FixedPrim,
(>$<))
import qualified Data.ByteString.Builder.Internal as BI
import qualified Data.ByteString.Builder.Prim as P
import qualified Data.ByteString.Builder.Prim.Internal as PI

import Foreign
import GHC.Exts (Addr#)
import GHC.Ptr (Ptr(..))

import System.Random

Expand Down Expand Up @@ -247,6 +251,18 @@ largeTraversalInput = S.concat (replicate 10 byteStringData)
smallTraversalInput :: S.ByteString
smallTraversalInput = S8.pack "The quick brown fox"

asciiBuf, utf8Buf :: Ptr Word8
asciiBuf = Ptr "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"#
utf8Buf = Ptr "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xc0\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"#

asciiLit, utf8Lit :: Ptr Word8 -> Builder
asciiLit str@(Ptr addr) = BI.asciiLiteralCopy str (byteCountLiteral addr)
utf8Lit str@(Ptr addr) = BI.modUtf8LitCopy str (byteCountLiteral addr)

asciiStr, utf8Str :: String
asciiStr = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
utf8Str = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\0xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

main :: IO ()
main = do
defaultMain
Expand All @@ -256,9 +272,15 @@ main = do
, benchB' "ensureFree 8" () (const (ensureFree 8))
, benchB' "intHost 1" 1 intHost
, benchB' "UTF-8 String (naive)" "hello world\0" fromString
, benchB' "UTF-8 String" () $ \() -> P.cstringUtf8 "hello world\0"#
, benchB' "UTF-8 String" () $ \() -> utf8Lit (Ptr "hello world\xc0\x80"#)
, benchB' "String (naive)" "hello world!" fromString
, benchB' "String" () $ \() -> P.cstring "hello world!"#
, benchB' "String" () $ \() -> asciiLit (Ptr "hello world!"#)
, benchB' "AsciiLit" () $ \() -> asciiLit asciiBuf
, benchB' "Utf8Lit" () $ \() -> utf8Lit utf8Buf
, benchB' "strLit" () $ \() -> string8 asciiStr
, benchB' "stringUtf8" () $ \() -> stringUtf8 utf8Str
, benchB' "strLitInline" () $ \() -> string8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
, benchB' "utf8LitInline" () $ \() -> stringUtf8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\0XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
]

, bgroup "Encoding wrappers"
Expand Down
12 changes: 1 addition & 11 deletions tests/builder/Data/ByteString/Builder/Prim/Tests.hs
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,7 @@ import Test.Tasty.QuickCheck

tests :: [TestTree]
tests = concat [ testsBinary, testsASCII, testsChar8, testsUtf8
, testsCombinatorsB, [testCString, testCStringUtf8] ]

testCString :: TestTree
testCString = testProperty "cstring" $
toLazyByteString (BP.cstring "hello world!"#) ==
LC.pack "hello" `L.append` L.singleton 0x20 `L.append` LC.pack "world!"

testCStringUtf8 :: TestTree
testCStringUtf8 = testProperty "cstringUtf8" $
toLazyByteString (BP.cstringUtf8 "hello\xc0\x80world!"#) ==
LC.pack "hello" `L.append` L.singleton 0x00 `L.append` LC.pack "world!"
, testsCombinatorsB ]

------------------------------------------------------------------------------
-- Binary
Expand Down
Loading
Loading