1 {-# LANGUAGE BangPatterns      #-}
    2 {-# LANGUAGE CPP               #-}
    3 {-# LANGUAGE MagicHash         #-}
    4 {-# LANGUAGE OverloadedStrings #-}
    5 
    6 ------------------------------------------------------------------------------
    7 module Snap.Internal.Parsing where
    8 ------------------------------------------------------------------------------
    9 import           Control.Applicative              (Alternative ((<|>)), Applicative ((*>), (<*), pure), liftA2, (<$>))
   10 import           Control.Arrow                    (first, second)
   11 import           Control.Monad                    (Monad (return), MonadPlus (mzero), liftM, when)
   12 import           Data.Attoparsec.ByteString.Char8 (IResult (Done, Fail, Partial), Parser, Result, anyChar, char, choice, decimal, endOfInput, feed, inClass, isDigit, isSpace, letter_ascii, many', match, option, parse, satisfy, skipSpace, skipWhile, string, take, takeTill, takeWhile)
   13 import qualified Data.Attoparsec.ByteString.Char8 as AP
   14 import           Data.Bits                        (Bits ((.&.), (.|.), unsafeShiftL))
   15 import           Data.ByteString.Builder          (Builder, byteString, char8, toLazyByteString, word8)
   16 import           Data.ByteString.Char8            (ByteString)
   17 import qualified Data.ByteString.Char8            as S
   18 import           Data.ByteString.Internal         (c2w, w2c)
   19 import qualified Data.ByteString.Lazy.Char8       as L
   20 import           Data.CaseInsensitive             (CI)
   21 import qualified Data.CaseInsensitive             as CI (mk)
   22 import           Data.Char                        (Char, intToDigit, isAlpha, isAlphaNum, isAscii, isControl, isHexDigit, ord)
   23 import           Data.Int                         (Int64)
   24 import           Data.List                        (concat, intercalate, intersperse)
   25 import           Data.Map                         (Map)
   26 import qualified Data.Map                         as Map (empty, insertWith', toList)
   27 import           Data.Maybe                       (Maybe (..), maybe)
   28 import           Data.Monoid                      (Monoid (mconcat, mempty), (<>))
   29 import           Data.Word                        (Word8)
   30 import           GHC.Exts                         (Int (I#), uncheckedShiftRL#, word2Int#)
   31 import           GHC.Word                         (Word8 (..))
   32 import           Prelude                          (Bool (..), Either (..), Enum (fromEnum, toEnum), Eq (..), Num (..), Ord (..), String, and, any, concatMap, elem, error, filter, flip, foldr, fst, id, map, not, otherwise, show, snd, ($), ($!), (&&), (++), (.), (||))
   33 import           Snap.Internal.Http.Types         (Cookie (Cookie))
   34 ------------------------------------------------------------------------------
   35 
   36 
   37 ------------------------------------------------------------------------------
   38 {-# INLINE fullyParse #-}
   39 fullyParse :: ByteString -> Parser a -> Either String a
   40 fullyParse = fullyParse' parse feed
   41 
   42 {-# INLINE (<?>) #-}
   43 (<?>) :: Parser a -> String -> Parser a
   44 (<?>) a !b = (AP.<?>) a b
   45 infix 0 <?>
   46 
   47 ------------------------------------------------------------------------------
   48 {-# INLINE fullyParse' #-}
   49 fullyParse' :: (Parser a -> ByteString -> Result a)
   50             -> (Result a -> ByteString -> Result a)
   51             -> ByteString
   52             -> Parser a
   53             -> Either String a
   54 fullyParse' parseFunc feedFunc s p =
   55     case r' of
   56       (Fail _ context e) -> Left $ concat [ "Parsing "
   57                                           , intercalate "/" context
   58                                           , ": "
   59                                           , e
   60                                           , "."
   61                                           ]
   62       (Partial _)  -> Left "parse failed"  -- expected to be impossible
   63       (Done _ x)   -> Right x
   64   where
   65     r  = parseFunc p s
   66     r' = feedFunc r ""
   67 
   68 ------------------------------------------------------------------------------
   69 -- Parsers for different tokens in an HTTP request.
   70 
   71 ------------------------------------------------------------------------------
   72 parseNum :: Parser Int64
   73 parseNum = decimal
   74 
   75 
   76 ------------------------------------------------------------------------------
   77 untilEOL :: Parser ByteString
   78 untilEOL = takeWhile notend <?> "untilEOL"
   79   where
   80     notend c = not $ c == '\r' || c == '\n'
   81 
   82 
   83 ------------------------------------------------------------------------------
   84 crlf :: Parser ByteString
   85 crlf = string "\r\n" <?> "crlf"
   86 
   87 
   88 ------------------------------------------------------------------------------
   89 toTable :: (Char -> Bool) -> (Char -> Bool)
   90 toTable f = inClass $ filter f $ map w2c [0..255]
   91 {-# INLINE toTable #-}
   92 
   93 
   94 ------------------------------------------------------------------------------
   95 skipFieldChars :: Parser ()
   96 skipFieldChars = skipWhile isFieldChar
   97 
   98 
   99 ------------------------------------------------------------------------------
  100 isFieldChar :: Char -> Bool
  101 isFieldChar = toTable f
  102   where
  103     f c = (isDigit c) || (isAlpha c) || c == '-' || c == '_'
  104 
  105 
  106 ------------------------------------------------------------------------------
  107 -- | Parser for request headers.
  108 pHeaders :: Parser [(ByteString, ByteString)]
  109 pHeaders = many' header <?> "headers"
  110   where
  111     --------------------------------------------------------------------------
  112     slurp p = fst <$> match p
  113 
  114     --------------------------------------------------------------------------
  115     header            = {-# SCC "pHeaders/header" #-}
  116                         liftA2 (,)
  117                             fieldName
  118                             (char ':' *> skipSpace *> contents)
  119 
  120     --------------------------------------------------------------------------
  121     fieldName         = {-# SCC "pHeaders/fieldName" #-}
  122                         slurp (letter_ascii *> skipFieldChars)
  123 
  124     --------------------------------------------------------------------------
  125     contents          = {-# SCC "pHeaders/contents" #-}
  126                         liftA2 S.append
  127                             (untilEOL <* crlf)
  128                             (continuation <|> pure S.empty)
  129 
  130     --------------------------------------------------------------------------
  131     isLeadingWS w     = {-# SCC "pHeaders/isLeadingWS" #-}
  132                         w == ' ' || w == '\t'
  133 
  134     --------------------------------------------------------------------------
  135     leadingWhiteSpace = {-# SCC "pHeaders/leadingWhiteSpace" #-}
  136                         skipWhile1 isLeadingWS
  137 
  138     --------------------------------------------------------------------------
  139     continuation      = {-# SCC "pHeaders/continuation" #-}
  140                         liftA2 S.cons
  141                                (leadingWhiteSpace *> pure ' ')
  142                                contents
  143 
  144     --------------------------------------------------------------------------
  145     skipWhile1 f = satisfy f *> skipWhile f
  146 
  147 
  148 ------------------------------------------------------------------------------
  149 -- unhelpfully, the spec mentions "old-style" cookies that don't have quotes
  150 -- around the value. wonderful.
  151 pWord :: Parser ByteString
  152 pWord = pQuotedString <|> (takeWhile (/= ';'))
  153 
  154 
  155 ------------------------------------------------------------------------------
  156 pQuotedString :: Parser ByteString
  157 pQuotedString = q *> quotedText <* q
  158   where
  159     quotedText = (S.concat . L.toChunks . toLazyByteString) <$> f mempty
  160 
  161     f soFar = do
  162         t <- takeWhile qdtext
  163         let soFar' = soFar <> byteString t
  164         -- RFC says that backslash only escapes for <">
  165         choice [ string "\\\"" *> f (soFar' <> char8 '"')
  166                , pure soFar' ]
  167 
  168     q      = char '"'
  169     qdtext = matchAll [ isRFCText, (/= '"'), (/= '\\') ]
  170 
  171 
  172 ------------------------------------------------------------------------------
  173 {-# INLINE isRFCText #-}
  174 isRFCText :: Char -> Bool
  175 isRFCText = not . isControl
  176 
  177 
  178 ------------------------------------------------------------------------------
  179 {-# INLINE matchAll #-}
  180 matchAll :: [ Char -> Bool ] -> Char -> Bool
  181 matchAll x c = and $ map ($ c) x
  182 
  183 
  184 ------------------------------------------------------------------------------
  185 pAvPairs :: Parser [(ByteString, ByteString)]
  186 pAvPairs = do
  187     a <- pAvPair
  188     b <- many' (skipSpace *> char ';' *> skipSpace *> pAvPair)
  189     return $! a:b
  190 
  191 
  192 ------------------------------------------------------------------------------
  193 {-# INLINE pAvPair #-}
  194 pAvPair :: Parser (ByteString, ByteString)
  195 pAvPair = do
  196     key <- pToken <* skipSpace
  197     val <- liftM trim (option "" $ char '=' *> skipSpace *> pWord)
  198     return $! (key, val)
  199 
  200 
  201 ------------------------------------------------------------------------------
  202 pParameter :: Parser (ByteString, ByteString)
  203 pParameter = parser <?> "pParameter"
  204   where
  205     parser = do
  206         key <- pToken <* skipSpace
  207         val <- liftM trim (char '=' *> skipSpace *> pWord)
  208         return $! (trim key, val)
  209 
  210 
  211 ------------------------------------------------------------------------------
  212 {-# INLINE trim #-}
  213 trim :: ByteString -> ByteString
  214 trim = snd . S.span isSpace . fst . S.spanEnd isSpace
  215 
  216 
  217 ------------------------------------------------------------------------------
  218 pValueWithParameters :: Parser (ByteString, [(CI ByteString, ByteString)])
  219 pValueWithParameters = parser <?> "pValueWithParameters"
  220   where
  221     parser = do
  222         value  <- liftM trim (skipSpace *> takeWhile (/= ';'))
  223         params <- many' pParam
  224         endOfInput
  225         return (value, map (first CI.mk) params)
  226     pParam = skipSpace *> char ';' *> skipSpace *> pParameter
  227 
  228 
  229 ------------------------------------------------------------------------------
  230 pContentTypeWithParameters :: Parser ( ByteString
  231                                      , [(CI ByteString, ByteString)] )
  232 pContentTypeWithParameters = parser <?> "pContentTypeWithParameters"
  233   where
  234     parser = do
  235         value  <- liftM trim (skipSpace *> takeWhile (not . isSep))
  236         params <- many' (skipSpace *> satisfy isSep *> skipSpace *> pParameter)
  237         endOfInput
  238         return $! (value, map (first CI.mk) params)
  239 
  240     isSep c = c == ';' || c == ','
  241 
  242 
  243 ------------------------------------------------------------------------------
  244 {-# INLINE pToken #-}
  245 pToken :: Parser ByteString
  246 pToken = takeWhile isToken
  247 
  248 
  249 ------------------------------------------------------------------------------
  250 {-# INLINE isToken #-}
  251 isToken :: Char -> Bool
  252 isToken = toTable f
  253   where
  254     f = matchAll [ isAscii
  255                  , not . isControl
  256                  , not . isSpace
  257                  , not . flip elem [ '(', ')', '<', '>', '@', ',', ';'
  258                                    , ':', '\\', '\"', '/', '[', ']'
  259                                    , '?', '=', '{', '}' ]
  260                  ]
  261 
  262 
  263                               ------------------
  264                               -- Url encoding --
  265                               ------------------
  266 
  267 ------------------------------------------------------------------------------
  268 {-# INLINE parseToCompletion #-}
  269 parseToCompletion :: Parser a -> ByteString -> Maybe a
  270 parseToCompletion p s = toResult $ finish r
  271   where
  272     r = parse p s
  273 
  274     toResult (Done _ c) = Just c
  275     toResult _          = Nothing
  276 
  277 
  278 ------------------------------------------------------------------------------
  279 type DList a = [a] -> [a]
  280 
  281 pUrlEscaped :: Parser ByteString
  282 pUrlEscaped = do
  283     sq <- nextChunk id
  284     return $! S.concat $ sq []
  285 
  286   where
  287     --------------------------------------------------------------------------
  288     nextChunk :: DList ByteString -> Parser (DList ByteString)
  289     nextChunk !s = (endOfInput *> pure s) <|> do
  290         c <- anyChar
  291         case c of
  292           '+' -> plusSpace s
  293           '%' -> percentEncoded s
  294           _   -> unEncoded c s
  295 
  296     --------------------------------------------------------------------------
  297     percentEncoded :: DList ByteString -> Parser (DList ByteString)
  298     percentEncoded !l = do
  299         hx <- take 2
  300         when (S.length hx /= 2 || (not $ S.all isHexDigit hx)) $
  301              mzero
  302 
  303         let code = w2c ((unsafeFromHex hx) :: Word8)
  304         nextChunk $ l . ((S.singleton code) :)
  305 
  306     --------------------------------------------------------------------------
  307     unEncoded :: Char -> DList ByteString -> Parser (DList ByteString)
  308     unEncoded !c !l' = do
  309         let l = l' . ((S.singleton c) :)
  310         bs   <- takeTill (flip elem ['%', '+'])
  311         if S.null bs
  312           then nextChunk l
  313           else nextChunk $ l . (bs :)
  314 
  315     --------------------------------------------------------------------------
  316     plusSpace :: DList ByteString -> Parser (DList ByteString)
  317     plusSpace l = nextChunk (l . ((S.singleton ' ') :))
  318 
  319 
  320 ------------------------------------------------------------------------------
  321 -- "...Only alphanumerics [0-9a-zA-Z], the special characters "$-_.+!*'(),"
  322 -- [not including the quotes - ed], and reserved characters used for their
  323 -- reserved purposes may be used unencoded within a URL."
  324 
  325 
  326 
  327 
  328 ------------------------------------------------------------------------------
  329 -- | Decode an URL-escaped string (see
  330 -- <http://tools.ietf.org/html/rfc2396.html#section-2.4>)
  331 --
  332 -- Example:
  333 --
  334 -- @
  335 -- ghci> 'urlDecode' "1+attoparsec+%7e%3d+3+*+10%5e-2+meters"
  336 -- Just "1 attoparsec ~= 3 * 10^-2 meters"
  337 -- @
  338 urlDecode :: ByteString -> Maybe ByteString
  339 urlDecode = parseToCompletion pUrlEscaped
  340 {-# INLINE urlDecode #-}
  341 
  342 
  343 ------------------------------------------------------------------------------
  344 -- | URL-escape a string (see
  345 -- <http://tools.ietf.org/html/rfc2396.html#section-2.4>)
  346 --
  347 -- Example:
  348 --
  349 -- @
  350 -- ghci> 'urlEncode' "1 attoparsec ~= 3 * 10^-2 meters"
  351 -- "1+attoparsec+%7e%3d+3+*+10%5e-2+meters"
  352 -- @
  353 urlEncode :: ByteString -> ByteString
  354 urlEncode = S.concat . L.toChunks . toLazyByteString . urlEncodeBuilder
  355 {-# INLINE urlEncode #-}
  356 
  357 
  358 ------------------------------------------------------------------------------
  359 -- | URL-escape a string (see
  360 -- <http://tools.ietf.org/html/rfc2396.html#section-2.4>) into a 'Builder'.
  361 --
  362 -- Example:
  363 --
  364 -- @
  365 -- ghci> import "Data.ByteString.Builder"
  366 -- ghci> 'toLazyByteString' . 'urlEncodeBuilder' $ "1 attoparsec ~= 3 * 10^-2 meters"
  367 -- "1+attoparsec+%7e%3d+3+*+10%5e-2+meters"
  368 -- @
  369 urlEncodeBuilder :: ByteString -> Builder
  370 urlEncodeBuilder = go mempty
  371   where
  372     go !b !s = maybe b' esc (S.uncons y)
  373       where
  374         (x,y)     = S.span urlEncodeClean s
  375         b'        = b <> byteString x
  376         esc (c,r) = let b'' = if c == ' '
  377                                 then b' <> char8 '+'
  378                                 else b' <> hexd c
  379                     in go b'' r
  380 
  381 
  382 ------------------------------------------------------------------------------
  383 urlEncodeClean :: Char -> Bool
  384 urlEncodeClean = toTable f
  385   where
  386     f c = any ($ c) [\c' -> isAscii c' && isAlphaNum c'
  387                     , flip elem [ '$', '_', '-', '.', '!'
  388                                 , '*' , '\'', '(', ')', ',' ]]
  389 
  390 
  391 ------------------------------------------------------------------------------
  392 hexd :: Char -> Builder
  393 hexd c0 = char8 '%' <> word8 hi <> word8 low
  394   where
  395     !c        = c2w c0
  396     toDigit   = c2w . intToDigit
  397     !low      = toDigit $ fromEnum $ c .&. 0xf
  398     !hi       = toDigit $ (c .&. 0xf0) `shiftr` 4
  399 
  400     shiftr (W8# a#) (I# b#) = I# (word2Int# (uncheckedShiftRL# a# b#))
  401 
  402 
  403 ------------------------------------------------------------------------------
  404 finish :: Result a -> Result a
  405 finish (Partial f) = flip feed "" $ f ""
  406 finish x           = x
  407 
  408 
  409                     ---------------------------------------
  410                     -- application/x-www-form-urlencoded --
  411                     ---------------------------------------
  412 
  413 ------------------------------------------------------------------------------
  414 -- | Parse a string encoded in @application/x-www-form-urlencoded@ < http://en.wikipedia.org/wiki/POST_%28HTTP%29#Use_for_submitting_web_forms format>.
  415 --
  416 -- Example:
  417 --
  418 -- @
  419 -- ghci> 'parseUrlEncoded' "Name=John+Doe&Name=Jane+Doe&Age=23&Formula=a+%2B+b+%3D%3D+13%25%21"
  420 -- 'Data.Map.fromList' [("Age",["23"]),("Formula",["a + b == 13%!"]),("Name",["John Doe","Jane Doe"])]
  421 -- @
  422 parseUrlEncoded :: ByteString -> Map ByteString [ByteString]
  423 parseUrlEncoded s = foldr ins Map.empty decoded
  424 
  425   where
  426     --------------------------------------------------------------------------
  427     ins (!k,v) !m = Map.insertWith' (++) k [v] m
  428 
  429     --------------------------------------------------------------------------
  430     parts :: [(ByteString,ByteString)]
  431     parts = map breakApart $
  432             S.splitWith (\c -> c == '&' || c == ';') s
  433 
  434     --------------------------------------------------------------------------
  435     breakApart = (second (S.drop 1)) . S.break (== '=')
  436 
  437     --------------------------------------------------------------------------
  438     urldecode = parseToCompletion pUrlEscaped
  439 
  440     --------------------------------------------------------------------------
  441     decodeOne (a,b) = do
  442         !a' <- urldecode a
  443         !b' <- urldecode b
  444         return $! (a',b')
  445 
  446     --------------------------------------------------------------------------
  447     decoded = go id parts
  448       where
  449         go !dl []     = dl []
  450         go !dl (x:xs) = maybe (go dl xs)
  451                               (\p -> go (dl . (p:)) xs)
  452                               (decodeOne x)
  453 
  454 
  455 ------------------------------------------------------------------------------
  456 -- | Like 'printUrlEncoded', but produces a 'Builder' instead of a
  457 -- 'ByteString'. Useful for constructing a large string efficiently in
  458 -- a single step.
  459 --
  460 -- Example:
  461 --
  462 -- @
  463 -- ghci> import "Data.Map"
  464 -- ghci> import "Data.Monoid"
  465 -- ghci> import "Data.ByteString.Builder"
  466 -- ghci> let bldr = 'buildUrlEncoded' ('Data.Map.fromList' [("Name", ["John Doe"]), ("Age", ["23"])])
  467 -- ghci> 'toLazyByteString' $ 'byteString' "http://example.com/script?" <> bldr
  468 -- "http://example.com/script?Age=23&Name=John+Doe"
  469 -- @
  470 buildUrlEncoded :: Map ByteString [ByteString] -> Builder
  471 buildUrlEncoded m = mconcat builders
  472   where
  473     builders        = intersperse (char8 '&') $
  474                       concatMap encodeVS $ Map.toList m
  475 
  476     encodeVS (k,vs) = map (encodeOne k) vs
  477 
  478     encodeOne k v   = mconcat [ urlEncodeBuilder k
  479                               , char8 '='
  480                               , urlEncodeBuilder v ]
  481 
  482 
  483 ------------------------------------------------------------------------------
  484 -- | Given a collection of key-value pairs with possibly duplicate
  485 -- keys (represented as a 'Data.Map.Map'), construct a string in
  486 -- @application/x-www-form-urlencoded@ format.
  487 --
  488 -- Example:
  489 --
  490 -- @
  491 -- ghci> 'printUrlEncoded' ('Data.Map.fromList' [("Name", ["John Doe"]), ("Age", ["23"])])
  492 -- "Age=23&Name=John+Doe"
  493 -- @
  494 printUrlEncoded :: Map ByteString [ByteString] -> ByteString
  495 printUrlEncoded = S.concat . L.toChunks . toLazyByteString . buildUrlEncoded
  496 
  497 
  498                              --------------------
  499                              -- Cookie parsing --
  500                              --------------------
  501 
  502 ------------------------------------------------------------------------------
  503 -- these definitions try to mirror RFC-2068 (the HTTP/1.1 spec) and RFC-2109
  504 -- (cookie spec): please point out any errors!
  505 ------------------------------------------------------------------------------
  506 pCookies :: Parser [Cookie]
  507 pCookies = do
  508     -- grab kvps and turn to strict bytestrings
  509     kvps <- pAvPairs
  510     return $! map toCookie $ filter (not . S.isPrefixOf "$" . fst) kvps
  511 
  512   where
  513     toCookie (nm,val) = Cookie nm val Nothing Nothing Nothing False False
  514 
  515 
  516 ------------------------------------------------------------------------------
  517 parseCookie :: ByteString -> Maybe [Cookie]
  518 parseCookie = parseToCompletion pCookies
  519 
  520 
  521                             -----------------------
  522                             -- utility functions --
  523                             -----------------------
  524 
  525 ------------------------------------------------------------------------------
  526 unsafeFromHex :: (Enum a, Num a, Bits a) => ByteString -> a
  527 unsafeFromHex = S.foldl' f 0
  528   where
  529 #if MIN_VERSION_base(4,5,0)
  530     sl = unsafeShiftL
  531 #else
  532     sl = shiftL
  533 #endif
  534 
  535     f !cnt !i = sl cnt 4 .|. nybble i
  536 
  537     nybble c | c >= '0' && c <= '9' = toEnum $! fromEnum c - fromEnum '0'
  538              | c >= 'a' && c <= 'f' = toEnum $! 10 + fromEnum c - fromEnum 'a'
  539              | c >= 'A' && c <= 'F' = toEnum $! 10 + fromEnum c - fromEnum 'A'
  540              | otherwise            = error $ "bad hex digit: " ++ show c
  541 {-# INLINE unsafeFromHex #-}
  542 
  543 
  544 ------------------------------------------------------------------------------
  545 -- Note: only works for nonnegative naturals
  546 unsafeFromNat :: (Enum a, Num a, Bits a) => ByteString -> a
  547 unsafeFromNat = S.foldl' f 0
  548   where
  549     zero = ord '0'
  550     f !cnt !i = cnt * 10 + toEnum (digitToInt i)
  551 
  552     digitToInt c = if d >= 0 && d <= 9
  553                      then d
  554                      else error $ "bad digit: '" ++ [c] ++ "'"
  555       where
  556         !d = ord c - zero
  557 {-# INLINE unsafeFromNat #-}