1 {-# LANGUAGE BangPatterns #-} 2 {-# LANGUAGE CPP #-} 3 {-# LANGUAGE MagicHash #-} 4 {-# LANGUAGE OverloadedStrings #-} 5 6 ------------------------------------------------------------------------------ 7 module Snap.Internal.Parsing where 8 ------------------------------------------------------------------------------ 9 import Control.Applicative (Alternative ((<|>)), Applicative ((*>), (<*), pure), liftA2, (<$>)) 10 import Control.Arrow (first, second) 11 import Control.Monad (Monad (return), MonadPlus (mzero), liftM, when) 12 import Data.Attoparsec.ByteString.Char8 (IResult (Done, Fail, Partial), Parser, Result, anyChar, char, choice, decimal, endOfInput, feed, inClass, isDigit, isSpace, letter_ascii, many', match, option, parse, satisfy, skipSpace, skipWhile, string, take, takeTill, takeWhile) 13 import qualified Data.Attoparsec.ByteString.Char8 as AP 14 import Data.Bits (Bits ((.&.), (.|.), unsafeShiftL)) 15 import Data.ByteString.Builder (Builder, byteString, char8, toLazyByteString, word8) 16 import Data.ByteString.Char8 (ByteString) 17 import qualified Data.ByteString.Char8 as S 18 import Data.ByteString.Internal (c2w, w2c) 19 import qualified Data.ByteString.Lazy.Char8 as L 20 import Data.CaseInsensitive (CI) 21 import qualified Data.CaseInsensitive as CI (mk) 22 import Data.Char (Char, intToDigit, isAlpha, isAlphaNum, isAscii, isControl, isHexDigit, ord) 23 import Data.Int (Int64) 24 import Data.List (concat, intercalate, intersperse) 25 import Data.Map (Map) 26 import qualified Data.Map as Map (empty, insertWith', toList) 27 import Data.Maybe (Maybe (..), maybe) 28 import Data.Monoid (Monoid (mconcat, mempty), (<>)) 29 import Data.Word (Word8) 30 import GHC.Exts (Int (I#), uncheckedShiftRL#, word2Int#) 31 import GHC.Word (Word8 (..)) 32 import Prelude (Bool (..), Either (..), Enum (fromEnum, toEnum), Eq (..), Num (..), Ord (..), String, and, any, concatMap, elem, error, filter, flip, foldr, fst, id, map, not, otherwise, show, snd, ($), ($!), (&&), (++), (.), (||)) 33 import Snap.Internal.Http.Types (Cookie (Cookie)) 34 ------------------------------------------------------------------------------ 35 36 37 ------------------------------------------------------------------------------ 38 {-# INLINE fullyParse #-} 39 fullyParse :: ByteString -> Parser a -> Either String a 40 fullyParse = fullyParse' parse feed 41 42 {-# INLINE (<?>) #-} 43 (<?>) :: Parser a -> String -> Parser a 44 (<?>) a !b = (AP.<?>) a b 45 infix 0 <?> 46 47 ------------------------------------------------------------------------------ 48 {-# INLINE fullyParse' #-} 49 fullyParse' :: (Parser a -> ByteString -> Result a) 50 -> (Result a -> ByteString -> Result a) 51 -> ByteString 52 -> Parser a 53 -> Either String a 54 fullyParse' parseFunc feedFunc s p = 55 case r' of 56 (Fail _ context e) -> Left $ concat [ "Parsing " 57 , intercalate "/" context 58 , ": " 59 , e 60 , "." 61 ] 62 (Partial _) -> Left "parse failed" -- expected to be impossible 63 (Done _ x) -> Right x 64 where 65 r = parseFunc p s 66 r' = feedFunc r "" 67 68 ------------------------------------------------------------------------------ 69 -- Parsers for different tokens in an HTTP request. 70 71 ------------------------------------------------------------------------------ 72 parseNum :: Parser Int64 73 parseNum = decimal 74 75 76 ------------------------------------------------------------------------------ 77 untilEOL :: Parser ByteString 78 untilEOL = takeWhile notend <?> "untilEOL" 79 where 80 notend c = not $ c == '\r' || c == '\n' 81 82 83 ------------------------------------------------------------------------------ 84 crlf :: Parser ByteString 85 crlf = string "\r\n" <?> "crlf" 86 87 88 ------------------------------------------------------------------------------ 89 toTable :: (Char -> Bool) -> (Char -> Bool) 90 toTable f = inClass $ filter f $ map w2c [0..255] 91 {-# INLINE toTable #-} 92 93 94 ------------------------------------------------------------------------------ 95 skipFieldChars :: Parser () 96 skipFieldChars = skipWhile isFieldChar 97 98 99 ------------------------------------------------------------------------------ 100 isFieldChar :: Char -> Bool 101 isFieldChar = toTable f 102 where 103 f c = (isDigit c) || (isAlpha c) || c == '-' || c == '_' 104 105 106 ------------------------------------------------------------------------------ 107 -- | Parser for request headers. 108 pHeaders :: Parser [(ByteString, ByteString)] 109 pHeaders = many' header <?> "headers" 110 where 111 -------------------------------------------------------------------------- 112 slurp p = fst <$> match p 113 114 -------------------------------------------------------------------------- 115 header = {-# SCC "pHeaders/header" #-} 116 liftA2 (,) 117 fieldName 118 (char ':' *> skipSpace *> contents) 119 120 -------------------------------------------------------------------------- 121 fieldName = {-# SCC "pHeaders/fieldName" #-} 122 slurp (letter_ascii *> skipFieldChars) 123 124 -------------------------------------------------------------------------- 125 contents = {-# SCC "pHeaders/contents" #-} 126 liftA2 S.append 127 (untilEOL <* crlf) 128 (continuation <|> pure S.empty) 129 130 -------------------------------------------------------------------------- 131 isLeadingWS w = {-# SCC "pHeaders/isLeadingWS" #-} 132 w == ' ' || w == '\t' 133 134 -------------------------------------------------------------------------- 135 leadingWhiteSpace = {-# SCC "pHeaders/leadingWhiteSpace" #-} 136 skipWhile1 isLeadingWS 137 138 -------------------------------------------------------------------------- 139 continuation = {-# SCC "pHeaders/continuation" #-} 140 liftA2 S.cons 141 (leadingWhiteSpace *> pure ' ') 142 contents 143 144 -------------------------------------------------------------------------- 145 skipWhile1 f = satisfy f *> skipWhile f 146 147 148 ------------------------------------------------------------------------------ 149 -- unhelpfully, the spec mentions "old-style" cookies that don't have quotes 150 -- around the value. wonderful. 151 pWord :: Parser ByteString 152 pWord = pQuotedString <|> (takeWhile (/= ';')) 153 154 155 ------------------------------------------------------------------------------ 156 pQuotedString :: Parser ByteString 157 pQuotedString = q *> quotedText <* q 158 where 159 quotedText = (S.concat . L.toChunks . toLazyByteString) <$> f mempty 160 161 f soFar = do 162 t <- takeWhile qdtext 163 let soFar' = soFar <> byteString t 164 -- RFC says that backslash only escapes for <"> 165 choice [ string "\\\"" *> f (soFar' <> char8 '"') 166 , pure soFar' ] 167 168 q = char '"' 169 qdtext = matchAll [ isRFCText, (/= '"'), (/= '\\') ] 170 171 172 ------------------------------------------------------------------------------ 173 {-# INLINE isRFCText #-} 174 isRFCText :: Char -> Bool 175 isRFCText = not . isControl 176 177 178 ------------------------------------------------------------------------------ 179 {-# INLINE matchAll #-} 180 matchAll :: [ Char -> Bool ] -> Char -> Bool 181 matchAll x c = and $ map ($ c) x 182 183 184 ------------------------------------------------------------------------------ 185 pAvPairs :: Parser [(ByteString, ByteString)] 186 pAvPairs = do 187 a <- pAvPair 188 b <- many' (skipSpace *> char ';' *> skipSpace *> pAvPair) 189 return $! a:b 190 191 192 ------------------------------------------------------------------------------ 193 {-# INLINE pAvPair #-} 194 pAvPair :: Parser (ByteString, ByteString) 195 pAvPair = do 196 key <- pToken <* skipSpace 197 val <- liftM trim (option "" $ char '=' *> skipSpace *> pWord) 198 return $! (key, val) 199 200 201 ------------------------------------------------------------------------------ 202 pParameter :: Parser (ByteString, ByteString) 203 pParameter = parser <?> "pParameter" 204 where 205 parser = do 206 key <- pToken <* skipSpace 207 val <- liftM trim (char '=' *> skipSpace *> pWord) 208 return $! (trim key, val) 209 210 211 ------------------------------------------------------------------------------ 212 {-# INLINE trim #-} 213 trim :: ByteString -> ByteString 214 trim = snd . S.span isSpace . fst . S.spanEnd isSpace 215 216 217 ------------------------------------------------------------------------------ 218 pValueWithParameters :: Parser (ByteString, [(CI ByteString, ByteString)]) 219 pValueWithParameters = parser <?> "pValueWithParameters" 220 where 221 parser = do 222 value <- liftM trim (skipSpace *> takeWhile (/= ';')) 223 params <- many' pParam 224 endOfInput 225 return (value, map (first CI.mk) params) 226 pParam = skipSpace *> char ';' *> skipSpace *> pParameter 227 228 229 ------------------------------------------------------------------------------ 230 pContentTypeWithParameters :: Parser ( ByteString 231 , [(CI ByteString, ByteString)] ) 232 pContentTypeWithParameters = parser <?> "pContentTypeWithParameters" 233 where 234 parser = do 235 value <- liftM trim (skipSpace *> takeWhile (not . isSep)) 236 params <- many' (skipSpace *> satisfy isSep *> skipSpace *> pParameter) 237 endOfInput 238 return $! (value, map (first CI.mk) params) 239 240 isSep c = c == ';' || c == ',' 241 242 243 ------------------------------------------------------------------------------ 244 {-# INLINE pToken #-} 245 pToken :: Parser ByteString 246 pToken = takeWhile isToken 247 248 249 ------------------------------------------------------------------------------ 250 {-# INLINE isToken #-} 251 isToken :: Char -> Bool 252 isToken = toTable f 253 where 254 f = matchAll [ isAscii 255 , not . isControl 256 , not . isSpace 257 , not . flip elem [ '(', ')', '<', '>', '@', ',', ';' 258 , ':', '\\', '\"', '/', '[', ']' 259 , '?', '=', '{', '}' ] 260 ] 261 262 263 ------------------ 264 -- Url encoding -- 265 ------------------ 266 267 ------------------------------------------------------------------------------ 268 {-# INLINE parseToCompletion #-} 269 parseToCompletion :: Parser a -> ByteString -> Maybe a 270 parseToCompletion p s = toResult $ finish r 271 where 272 r = parse p s 273 274 toResult (Done _ c) = Just c 275 toResult _ = Nothing 276 277 278 ------------------------------------------------------------------------------ 279 type DList a = [a] -> [a] 280 281 pUrlEscaped :: Parser ByteString 282 pUrlEscaped = do 283 sq <- nextChunk id 284 return $! S.concat $ sq [] 285 286 where 287 -------------------------------------------------------------------------- 288 nextChunk :: DList ByteString -> Parser (DList ByteString) 289 nextChunk !s = (endOfInput *> pure s) <|> do 290 c <- anyChar 291 case c of 292 '+' -> plusSpace s 293 '%' -> percentEncoded s 294 _ -> unEncoded c s 295 296 -------------------------------------------------------------------------- 297 percentEncoded :: DList ByteString -> Parser (DList ByteString) 298 percentEncoded !l = do 299 hx <- take 2 300 when (S.length hx /= 2 || (not $ S.all isHexDigit hx)) $ 301 mzero 302 303 let code = w2c ((unsafeFromHex hx) :: Word8) 304 nextChunk $ l . ((S.singleton code) :) 305 306 -------------------------------------------------------------------------- 307 unEncoded :: Char -> DList ByteString -> Parser (DList ByteString) 308 unEncoded !c !l' = do 309 let l = l' . ((S.singleton c) :) 310 bs <- takeTill (flip elem ['%', '+']) 311 if S.null bs 312 then nextChunk l 313 else nextChunk $ l . (bs :) 314 315 -------------------------------------------------------------------------- 316 plusSpace :: DList ByteString -> Parser (DList ByteString) 317 plusSpace l = nextChunk (l . ((S.singleton ' ') :)) 318 319 320 ------------------------------------------------------------------------------ 321 -- "...Only alphanumerics [0-9a-zA-Z], the special characters "$-_.+!*'()," 322 -- [not including the quotes - ed], and reserved characters used for their 323 -- reserved purposes may be used unencoded within a URL." 324 325 326 327 328 ------------------------------------------------------------------------------ 329 -- | Decode an URL-escaped string (see 330 -- <http://tools.ietf.org/html/rfc2396.html#section-2.4>) 331 -- 332 -- Example: 333 -- 334 -- @ 335 -- ghci> 'urlDecode' "1+attoparsec+%7e%3d+3+*+10%5e-2+meters" 336 -- Just "1 attoparsec ~= 3 * 10^-2 meters" 337 -- @ 338 urlDecode :: ByteString -> Maybe ByteString 339 urlDecode = parseToCompletion pUrlEscaped 340 {-# INLINE urlDecode #-} 341 342 343 ------------------------------------------------------------------------------ 344 -- | URL-escape a string (see 345 -- <http://tools.ietf.org/html/rfc2396.html#section-2.4>) 346 -- 347 -- Example: 348 -- 349 -- @ 350 -- ghci> 'urlEncode' "1 attoparsec ~= 3 * 10^-2 meters" 351 -- "1+attoparsec+%7e%3d+3+*+10%5e-2+meters" 352 -- @ 353 urlEncode :: ByteString -> ByteString 354 urlEncode = S.concat . L.toChunks . toLazyByteString . urlEncodeBuilder 355 {-# INLINE urlEncode #-} 356 357 358 ------------------------------------------------------------------------------ 359 -- | URL-escape a string (see 360 -- <http://tools.ietf.org/html/rfc2396.html#section-2.4>) into a 'Builder'. 361 -- 362 -- Example: 363 -- 364 -- @ 365 -- ghci> import "Data.ByteString.Builder" 366 -- ghci> 'toLazyByteString' . 'urlEncodeBuilder' $ "1 attoparsec ~= 3 * 10^-2 meters" 367 -- "1+attoparsec+%7e%3d+3+*+10%5e-2+meters" 368 -- @ 369 urlEncodeBuilder :: ByteString -> Builder 370 urlEncodeBuilder = go mempty 371 where 372 go !b !s = maybe b' esc (S.uncons y) 373 where 374 (x,y) = S.span urlEncodeClean s 375 b' = b <> byteString x 376 esc (c,r) = let b'' = if c == ' ' 377 then b' <> char8 '+' 378 else b' <> hexd c 379 in go b'' r 380 381 382 ------------------------------------------------------------------------------ 383 urlEncodeClean :: Char -> Bool 384 urlEncodeClean = toTable f 385 where 386 f c = any ($ c) [\c' -> isAscii c' && isAlphaNum c' 387 , flip elem [ '$', '_', '-', '.', '!' 388 , '*' , '\'', '(', ')', ',' ]] 389 390 391 ------------------------------------------------------------------------------ 392 hexd :: Char -> Builder 393 hexd c0 = char8 '%' <> word8 hi <> word8 low 394 where 395 !c = c2w c0 396 toDigit = c2w . intToDigit 397 !low = toDigit $ fromEnum $ c .&. 0xf 398 !hi = toDigit $ (c .&. 0xf0) `shiftr` 4 399 400 shiftr (W8# a#) (I# b#) = I# (word2Int# (uncheckedShiftRL# a# b#)) 401 402 403 ------------------------------------------------------------------------------ 404 finish :: Result a -> Result a 405 finish (Partial f) = flip feed "" $ f "" 406 finish x = x 407 408 409 --------------------------------------- 410 -- application/x-www-form-urlencoded -- 411 --------------------------------------- 412 413 ------------------------------------------------------------------------------ 414 -- | Parse a string encoded in @application/x-www-form-urlencoded@ < http://en.wikipedia.org/wiki/POST_%28HTTP%29#Use_for_submitting_web_forms format>. 415 -- 416 -- Example: 417 -- 418 -- @ 419 -- ghci> 'parseUrlEncoded' "Name=John+Doe&Name=Jane+Doe&Age=23&Formula=a+%2B+b+%3D%3D+13%25%21" 420 -- 'Data.Map.fromList' [("Age",["23"]),("Formula",["a + b == 13%!"]),("Name",["John Doe","Jane Doe"])] 421 -- @ 422 parseUrlEncoded :: ByteString -> Map ByteString [ByteString] 423 parseUrlEncoded s = foldr ins Map.empty decoded 424 425 where 426 -------------------------------------------------------------------------- 427 ins (!k,v) !m = Map.insertWith' (++) k [v] m 428 429 -------------------------------------------------------------------------- 430 parts :: [(ByteString,ByteString)] 431 parts = map breakApart $ 432 S.splitWith (\c -> c == '&' || c == ';') s 433 434 -------------------------------------------------------------------------- 435 breakApart = (second (S.drop 1)) . S.break (== '=') 436 437 -------------------------------------------------------------------------- 438 urldecode = parseToCompletion pUrlEscaped 439 440 -------------------------------------------------------------------------- 441 decodeOne (a,b) = do 442 !a' <- urldecode a 443 !b' <- urldecode b 444 return $! (a',b') 445 446 -------------------------------------------------------------------------- 447 decoded = go id parts 448 where 449 go !dl [] = dl [] 450 go !dl (x:xs) = maybe (go dl xs) 451 (\p -> go (dl . (p:)) xs) 452 (decodeOne x) 453 454 455 ------------------------------------------------------------------------------ 456 -- | Like 'printUrlEncoded', but produces a 'Builder' instead of a 457 -- 'ByteString'. Useful for constructing a large string efficiently in 458 -- a single step. 459 -- 460 -- Example: 461 -- 462 -- @ 463 -- ghci> import "Data.Map" 464 -- ghci> import "Data.Monoid" 465 -- ghci> import "Data.ByteString.Builder" 466 -- ghci> let bldr = 'buildUrlEncoded' ('Data.Map.fromList' [("Name", ["John Doe"]), ("Age", ["23"])]) 467 -- ghci> 'toLazyByteString' $ 'byteString' "http://example.com/script?" <> bldr 468 -- "http://example.com/script?Age=23&Name=John+Doe" 469 -- @ 470 buildUrlEncoded :: Map ByteString [ByteString] -> Builder 471 buildUrlEncoded m = mconcat builders 472 where 473 builders = intersperse (char8 '&') $ 474 concatMap encodeVS $ Map.toList m 475 476 encodeVS (k,vs) = map (encodeOne k) vs 477 478 encodeOne k v = mconcat [ urlEncodeBuilder k 479 , char8 '=' 480 , urlEncodeBuilder v ] 481 482 483 ------------------------------------------------------------------------------ 484 -- | Given a collection of key-value pairs with possibly duplicate 485 -- keys (represented as a 'Data.Map.Map'), construct a string in 486 -- @application/x-www-form-urlencoded@ format. 487 -- 488 -- Example: 489 -- 490 -- @ 491 -- ghci> 'printUrlEncoded' ('Data.Map.fromList' [("Name", ["John Doe"]), ("Age", ["23"])]) 492 -- "Age=23&Name=John+Doe" 493 -- @ 494 printUrlEncoded :: Map ByteString [ByteString] -> ByteString 495 printUrlEncoded = S.concat . L.toChunks . toLazyByteString . buildUrlEncoded 496 497 498 -------------------- 499 -- Cookie parsing -- 500 -------------------- 501 502 ------------------------------------------------------------------------------ 503 -- these definitions try to mirror RFC-2068 (the HTTP/1.1 spec) and RFC-2109 504 -- (cookie spec): please point out any errors! 505 ------------------------------------------------------------------------------ 506 pCookies :: Parser [Cookie] 507 pCookies = do 508 -- grab kvps and turn to strict bytestrings 509 kvps <- pAvPairs 510 return $! map toCookie $ filter (not . S.isPrefixOf "$" . fst) kvps 511 512 where 513 toCookie (nm,val) = Cookie nm val Nothing Nothing Nothing False False 514 515 516 ------------------------------------------------------------------------------ 517 parseCookie :: ByteString -> Maybe [Cookie] 518 parseCookie = parseToCompletion pCookies 519 520 521 ----------------------- 522 -- utility functions -- 523 ----------------------- 524 525 ------------------------------------------------------------------------------ 526 unsafeFromHex :: (Enum a, Num a, Bits a) => ByteString -> a 527 unsafeFromHex = S.foldl' f 0 528 where 529 #if MIN_VERSION_base(4,5,0) 530 sl = unsafeShiftL 531 #else 532 sl = shiftL 533 #endif 534 535 f !cnt !i = sl cnt 4 .|. nybble i 536 537 nybble c | c >= '0' && c <= '9' = toEnum $! fromEnum c - fromEnum '0' 538 | c >= 'a' && c <= 'f' = toEnum $! 10 + fromEnum c - fromEnum 'a' 539 | c >= 'A' && c <= 'F' = toEnum $! 10 + fromEnum c - fromEnum 'A' 540 | otherwise = error $ "bad hex digit: " ++ show c 541 {-# INLINE unsafeFromHex #-} 542 543 544 ------------------------------------------------------------------------------ 545 -- Note: only works for nonnegative naturals 546 unsafeFromNat :: (Enum a, Num a, Bits a) => ByteString -> a 547 unsafeFromNat = S.foldl' f 0 548 where 549 zero = ord '0' 550 f !cnt !i = cnt * 10 + toEnum (digitToInt i) 551 552 digitToInt c = if d >= 0 && d <= 9 553 then d 554 else error $ "bad digit: '" ++ [c] ++ "'" 555 where 556 !d = ord c - zero 557 {-# INLINE unsafeFromNat #-}