-- | The CSV (comma-separated value) format is defined by RFC 4180,
--   \"Common Format and MIME Type for Comma-Separated Values (CSV) Files\",
--   <http://www.rfc-editor.org/rfc/rfc4180.txt>
--
--   This lazy parser can report all CSV formatting errors, whilst also
--   returning all the valid data, so the user can choose whether to
--   continue, to show warnings, or to halt on error.
--
--   Valid fields retain information about their original location in the
--   input, so a secondary parser from textual fields to typed values
--   can give intelligent error messages.
--
--   In a valid CSV file, all rows must have the same number of columns.
--   This parser will flag a row with the wrong number of columns as a error.
--   (But the error type contains the actual data, so the user can recover
--   it if desired.)  Completely blank lines are also treated as errors,
--   and again the user is free either to filter these out or convert them
--   to a row of actual null fields.

module Text.CSV.Lazy.ByteString
  ( -- * CSV types
    CSVTable
  , CSVRow
  , CSVField(..)
    -- * CSV parsing
  , CSVError(..)
  , CSVResult
  , csvErrors
  , csvTable
  , csvTableFull
  , csvTableHeader
  , parseCSV
  , parseDSV
    -- * Pretty-printing
  , ppCSVError
  , ppCSVField
  , ppCSVTable
  , ppDSVTable
    -- * Conversion between standard and simple representations
  , fromCSVTable
  , toCSVTable
    -- * Selection, validation, and algebra of CSV tables
  , selectFields
  , expectFields
  , mkEmptyColumn
  , joinCSV
  , mkCSVField
  ) where

--  , ppCSVTableAsTuples

import Data.List     (groupBy, partition, elemIndex, intercalate, takeWhile
                     ,deleteFirstsBy, nub)
import Data.Function (on)
import Data.Maybe    (fromJust)
import qualified Data.ByteString.Lazy.Char8 as BS
import Data.ByteString.Lazy.Char8 (ByteString)

-- | A CSV table is a sequence of rows.  All rows have the same number
--   of fields.
type CSVTable   = [CSVRow]

-- | A CSV row is just a sequence of fields.
type CSVRow     = [CSVField]

-- | A CSV field's content is stored with its logical row and column number,
--   as well as its textual extent.  This information is necessary if you
--   want to generate good error messages in a secondary parsing stage,
--   should you choose to convert the textual fields to typed data values.
data CSVField   = CSVField       { CSVField -> Int
csvRowNum        :: !Int
                                 , CSVField -> Int
csvColNum        :: !Int
                                 , CSVField -> (Int, Int)
csvTextStart     :: !(Int,Int)
                                 , CSVField -> (Int, Int)
csvTextEnd       :: !(Int,Int)
                                 , CSVField -> ByteString
csvFieldContent  :: !ByteString
                                 , CSVField -> Bool
csvFieldQuoted   :: !Bool }
                | CSVFieldError  { csvRowNum        :: !Int
                                 , csvColNum        :: !Int
                                 , csvTextStart     :: !(Int,Int)
                                 , csvTextEnd       :: !(Int,Int)
                                 , CSVField -> [Char]
csvFieldError    :: !String }
                                                    deriving (CSVField -> CSVField -> Bool
(CSVField -> CSVField -> Bool)
-> (CSVField -> CSVField -> Bool) -> Eq CSVField
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: CSVField -> CSVField -> Bool
== :: CSVField -> CSVField -> Bool
$c/= :: CSVField -> CSVField -> Bool
/= :: CSVField -> CSVField -> Bool
Eq,Int -> CSVField -> ShowS
[CSVField] -> ShowS
CSVField -> [Char]
(Int -> CSVField -> ShowS)
-> (CSVField -> [Char]) -> ([CSVField] -> ShowS) -> Show CSVField
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> CSVField -> ShowS
showsPrec :: Int -> CSVField -> ShowS
$cshow :: CSVField -> [Char]
show :: CSVField -> [Char]
$cshowList :: [CSVField] -> ShowS
showList :: [CSVField] -> ShowS
Show)

-- | A structured error type for CSV formatting mistakes.
data CSVError   = IncorrectRow   { CSVError -> Int
csvRow           :: Int
                                 , CSVError -> Int
csvColsExpected  :: Int
                                 , CSVError -> Int
csvColsActual    :: Int
                                 , CSVError -> [CSVField]
csvFields        :: [CSVField] }
                | BlankLine      { csvRow           :: !Int
                                 , csvColsExpected  :: !Int
                                 , csvColsActual    :: !Int
                                 , CSVError -> CSVField
csvField         :: CSVField }
                | FieldError     { csvField         :: CSVField }
                | DuplicateHeader{ csvColsExpected  :: !Int
                                 , CSVError -> Int
csvHeaderSerial  :: !Int
                                 , CSVError -> [Char]
csvDuplicate     :: !String }
                | NoData
                                                    deriving (CSVError -> CSVError -> Bool
(CSVError -> CSVError -> Bool)
-> (CSVError -> CSVError -> Bool) -> Eq CSVError
forall a. (a -> a -> Bool) -> (a -> a -> Bool) -> Eq a
$c== :: CSVError -> CSVError -> Bool
== :: CSVError -> CSVError -> Bool
$c/= :: CSVError -> CSVError -> Bool
/= :: CSVError -> CSVError -> Bool
Eq,Int -> CSVError -> ShowS
[CSVError] -> ShowS
CSVError -> [Char]
(Int -> CSVError -> ShowS)
-> (CSVError -> [Char]) -> ([CSVError] -> ShowS) -> Show CSVError
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> CSVError -> ShowS
showsPrec :: Int -> CSVError -> ShowS
$cshow :: CSVError -> [Char]
show :: CSVError -> [Char]
$cshowList :: [CSVError] -> ShowS
showList :: [CSVError] -> ShowS
Show)

-- | The result of parsing a CSV input is a mixed collection of errors
--   and valid rows.  This way of representing things is crucial to the
--   ability to parse lazily whilst still catching format errors.
type CSVResult  = [ Either [CSVError] [CSVField] ]

-- | Extract just the valid portions of a CSV parse.
csvTable    :: CSVResult -> CSVTable
csvTable :: CSVResult -> CSVTable
csvTable  CSVResult
r  = [ [CSVField]
row | Right [CSVField]
row <- CSVResult
r ]
-- | Extract just the errors from a CSV parse.
csvErrors   :: CSVResult -> [CSVError]
csvErrors :: CSVResult -> [CSVError]
csvErrors CSVResult
r  = [[CSVError]] -> [CSVError]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [ [CSVError]
err | Left [CSVError]
err  <- CSVResult
r ]
-- | Extract the full table, including invalid rows, with padding, and
--   de-duplicated headers.
csvTableFull:: CSVResult -> CSVTable
csvTableFull :: CSVResult -> CSVTable
csvTableFull = (Either [CSVError] [CSVField] -> [CSVField])
-> CSVResult -> CSVTable
forall a b. (a -> b) -> [a] -> [b]
map Either [CSVError] [CSVField] -> [CSVField]
beCareful (CSVResult -> CSVTable)
-> (CSVResult -> CSVResult) -> CSVResult -> CSVTable
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CSVResult -> CSVResult
deduplicate
    where beCareful :: Either [CSVError] [CSVField] -> [CSVField]
beCareful (Right [CSVField]
row) = [CSVField]
row
          beCareful (Left (r :: CSVError
r@IncorrectRow{}:[CSVError]
_)) =
              CSVError -> [CSVField]
csvFields CSVError
r [CSVField] -> [CSVField] -> [CSVField]
forall a. [a] -> [a] -> [a]
++
              Int -> CSVField -> [CSVField]
forall a. Int -> a -> [a]
replicate (CSVError -> Int
csvColsExpected CSVError
r Int -> Int -> Int
forall a. Num a => a -> a -> a
- CSVError -> Int
csvColsActual CSVError
r)
                        (Int -> Int -> ByteString -> CSVField
mkCSVField (CSVError -> Int
csvRow CSVError
r) Int
0 ByteString
BS.empty)
          beCareful (Left (r :: CSVError
r@BlankLine{}:[CSVError]
_)) =
              Int -> CSVField -> [CSVField]
forall a. Int -> a -> [a]
replicate (CSVError -> Int
csvColsExpected CSVError
r)
                        (Int -> Int -> ByteString -> CSVField
mkCSVField (CSVError -> Int
csvRow CSVError
r) Int
0 ByteString
BS.empty)
          beCareful (Left (r :: CSVError
r@DuplicateHeader{}:[CSVError]
_)) = -- obsolete with deduping
              Int -> CSVField -> [CSVField]
forall a. Int -> a -> [a]
replicate (CSVError -> Int
csvColsExpected CSVError
r)
                        (Int -> Int -> ByteString -> CSVField
mkCSVField Int
0 Int
0 ByteString
BS.empty)
          beCareful (Left (FieldError{}:[CSVError]
r))      = Either [CSVError] [CSVField] -> [CSVField]
beCareful ([CSVError] -> Either [CSVError] [CSVField]
forall a b. a -> Either a b
Left [CSVError]
r)
          beCareful (Left (CSVError
NoData:[CSVError]
_))            = []
          beCareful (Left [])                    = []

          deduplicate :: CSVResult -> CSVResult
deduplicate (Left (errs :: [CSVError]
errs@(DuplicateHeader{}:[CSVError]
_)):Right [CSVField]
heads:CSVResult
rows) = 
--               Right (reverse $ foldl replace [] heads)
                 [CSVField] -> Either [CSVError] [CSVField]
forall a b. b -> Either a b
Right ([CSVError] -> [(CSVField, Int)] -> [CSVField]
replaceInOrder [CSVError]
errs ([CSVField] -> [Int] -> [(CSVField, Int)]
forall a b. [a] -> [b] -> [(a, b)]
zip [CSVField]
heads [Int
0..]))
                 Either [CSVError] [CSVField] -> CSVResult -> CSVResult
forall a. a -> [a] -> [a]
: CSVResult
rows
          deduplicate CSVResult
rows = CSVResult
rows
{-
          replace output header
              | headerName `elem` map csvFieldContent output
                          = header{ csvFieldContent = headerName
                                            `BS.append` BS.pack "_duplicate" }
                                  : output
              | otherwise = header: output
              where headerName = csvFieldContent header
-}
          replaceInOrder :: [CSVError] -> [(CSVField, Int)] -> [CSVField]
replaceInOrder []       [(CSVField, Int)]
headers        = ((CSVField, Int) -> CSVField) -> [(CSVField, Int)] -> [CSVField]
forall a b. (a -> b) -> [a] -> [b]
map (CSVField, Int) -> CSVField
forall a b. (a, b) -> a
fst [(CSVField, Int)]
headers
          replaceInOrder [CSVError]
_        []             = []
          replaceInOrder (CSVError
d:[CSVError]
dups) ((CSVField
h,Int
n):[(CSVField, Int)]
headers)
              | CSVError -> Int
csvHeaderSerial CSVError
d Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
n = CSVField
h{ csvFieldContent = BS.pack
                                                (csvDuplicate d++"_"++show n) }
                                          CSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
: [CSVError] -> [(CSVField, Int)] -> [CSVField]
replaceInOrder [CSVError]
dups     [(CSVField, Int)]
headers
              | Bool
otherwise              = CSVField
hCSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
: [CSVError] -> [(CSVField, Int)] -> [CSVField]
replaceInOrder (CSVError
dCSVError -> [CSVError] -> [CSVError]
forall a. a -> [a] -> [a]
:[CSVError]
dups) [(CSVField, Int)]
headers

-- | The header row of the CSV table, assuming it is non-empty.
csvTableHeader :: CSVResult -> [String]
csvTableHeader :: CSVResult -> [[Char]]
csvTableHeader = (CSVField -> [Char]) -> [CSVField] -> [[Char]]
forall a b. (a -> b) -> [a] -> [b]
map (ByteString -> [Char]
BS.unpack (ByteString -> [Char])
-> (CSVField -> ByteString) -> CSVField -> [Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CSVField -> ByteString
csvFieldContent) ([CSVField] -> [[Char]])
-> (CSVResult -> [CSVField]) -> CSVResult -> [[Char]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CSVResult -> [CSVField]
forall {a} {t}. [Either a t] -> t
firstRow
    where firstRow :: [Either a t] -> t
firstRow (Left a
_: [Either a t]
rest) = [Either a t] -> t
firstRow [Either a t]
rest
          firstRow (Right t
x: [Either a t]
_)   = t
x


-- | A first-stage parser for CSV (comma-separated values) data.
--   The individual fields remain as text, but errors in CSV formatting
--   are reported.  Errors (containing unrecognisable rows/fields) are
--   interspersed with the valid rows/fields.
parseCSV :: ByteString -> CSVResult
parseCSV :: ByteString -> CSVResult
parseCSV = Bool -> Char -> ByteString -> CSVResult
parseDSV Bool
True Char
','

-- | Sometimes CSV is not comma-separated, but delimiter-separated
--   values (DSV).  The choice of delimiter is arbitrary, but semi-colon
--   is common in locales where comma is used as a decimal point, and tab
--   is also common.  The Boolean argument is
--   whether newlines should be accepted within quoted fields.  The CSV RFC
--   says newlines can occur in quotes, but other DSV formats might say
--   otherwise.  You can often get better error messages if newlines are
--   disallowed.
parseDSV :: Bool -> Char -> ByteString -> CSVResult
parseDSV :: Bool -> Char -> ByteString -> CSVResult
parseDSV Bool
qn Char
delim = CSVTable -> CSVResult
validate
                    (CSVTable -> CSVResult)
-> (ByteString -> CSVTable) -> ByteString -> CSVResult
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (CSVField -> CSVField -> Bool) -> [CSVField] -> CSVTable
forall a. (a -> a -> Bool) -> [a] -> [[a]]
groupBy (Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
(==)(Int -> Int -> Bool)
-> (CSVField -> Int) -> CSVField -> CSVField -> Bool
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on`CSVField -> Int
csvRowNum)
                    ([CSVField] -> CSVTable)
-> (ByteString -> [CSVField]) -> ByteString -> CSVTable
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Bool -> Char -> ByteString -> [CSVField]
lexCSV Bool
qn Char
delim

validate          :: [CSVRow] -> CSVResult
validate :: CSVTable -> CSVResult
validate []        = [[CSVError] -> Either [CSVError] [CSVField]
forall a b. a -> Either a b
Left [CSVError
NoData]]
validate xs :: CSVTable
xs@([CSVField]
x:CSVTable
_)  = [CSVField] -> CSVResult -> CSVResult
checkDuplicateHeaders [CSVField]
x (CSVResult -> CSVResult) -> CSVResult -> CSVResult
forall a b. (a -> b) -> a -> b
$ ([CSVField] -> Either [CSVError] [CSVField])
-> CSVTable -> CSVResult
forall a b. (a -> b) -> [a] -> [b]
map (Int -> [CSVField] -> Either [CSVError] [CSVField]
extractErrs ([CSVField] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [CSVField]
x)) CSVTable
xs

extractErrs       :: Int -> CSVRow -> Either [CSVError] CSVRow
extractErrs :: Int -> [CSVField] -> Either [CSVError] [CSVField]
extractErrs Int
size [CSVField]
row
    | [CSVField] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [CSVField]
row0 Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
size Bool -> Bool -> Bool
&& [CSVField] -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [CSVField]
errs0    = [CSVField] -> Either [CSVError] [CSVField]
forall a b. b -> Either a b
Right [CSVField]
row0
    | [CSVField] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [CSVField]
row0 Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
1    Bool -> Bool -> Bool
&& CSVField -> Bool
empty CSVField
field0  = [CSVError] -> Either [CSVError] [CSVField]
forall a b. a -> Either a b
Left [CSVField -> CSVError
blankLine CSVField
field0]
    | Bool
otherwise                            = [CSVError] -> Either [CSVError] [CSVField]
forall a b. a -> Either a b
Left ((CSVField -> CSVError) -> [CSVField] -> [CSVError]
forall a b. (a -> b) -> [a] -> [b]
map CSVField -> CSVError
convert [CSVField]
errs0
                                                   [CSVError] -> [CSVError] -> [CSVError]
forall a. [a] -> [a] -> [a]
++ [CSVField] -> [CSVError]
validateColumns [CSVField]
row0)
  where
  ([CSVField]
row0,[CSVField]
errs0)  = (CSVField -> Bool) -> [CSVField] -> ([CSVField], [CSVField])
forall a. (a -> Bool) -> [a] -> ([a], [a])
partition CSVField -> Bool
isField [CSVField]
row
  (CSVField
field0:[CSVField]
_)    = [CSVField]
row0

  isField :: CSVField -> Bool
isField (CSVField{})       = Bool
True
  isField (CSVFieldError{})  = Bool
False

  empty :: CSVField -> Bool
empty   f :: CSVField
f@(CSVField{})    = ByteString -> Bool
BS.null (CSVField -> ByteString
csvFieldContent CSVField
f)
  empty   CSVField
_                 = Bool
False

  convert :: CSVField -> CSVError
convert CSVField
err = FieldError {csvField :: CSVField
csvField = CSVField
err}

  validateColumns :: [CSVField] -> [CSVError]
validateColumns [CSVField]
r  =
      if [CSVField] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [CSVField]
r Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
size then []
      else [ IncorrectRow{ csvRow :: Int
csvRow  = if [CSVField] -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [CSVField]
r then Int
0 else CSVField -> Int
csvRowNum ([CSVField] -> CSVField
forall a. HasCallStack => [a] -> a
head [CSVField]
r)
                         , csvColsExpected :: Int
csvColsExpected  = Int
size
                         , csvColsActual :: Int
csvColsActual    = [CSVField] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [CSVField]
r
                         , csvFields :: [CSVField]
csvFields        = [CSVField]
r } ]
  blankLine :: CSVField -> CSVError
blankLine CSVField
f = BlankLine{ csvRow :: Int
csvRow           = CSVField -> Int
csvRowNum CSVField
f
                         , csvColsExpected :: Int
csvColsExpected  = Int
size
                         , csvColsActual :: Int
csvColsActual    = Int
1
                         , csvField :: CSVField
csvField         = CSVField
f }

checkDuplicateHeaders :: CSVRow -> CSVResult -> CSVResult
checkDuplicateHeaders :: [CSVField] -> CSVResult -> CSVResult
checkDuplicateHeaders [CSVField]
row CSVResult
result =
    let headers :: [CSVField]
headers = [ CSVField
f | f :: CSVField
f@(CSVField{}) <- [CSVField]
row ]
        dups :: [CSVField]
dups    = (CSVField -> CSVField -> Bool)
-> [CSVField] -> [CSVField] -> [CSVField]
forall a. (a -> a -> Bool) -> [a] -> [a] -> [a]
deleteFirstsBy (ByteString -> ByteString -> Bool
forall a. Eq a => a -> a -> Bool
(==)(ByteString -> ByteString -> Bool)
-> (CSVField -> ByteString) -> CSVField -> CSVField -> Bool
forall b c a. (b -> b -> c) -> (a -> b) -> a -> a -> c
`on`CSVField -> ByteString
csvFieldContent)
                                 [CSVField]
headers ([CSVField] -> [CSVField]
forall a. Eq a => [a] -> [a]
nub [CSVField]
headers)
        n :: Int
n       = [CSVField] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [CSVField]
headers
    in if [CSVField] -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [CSVField]
dups then CSVResult
result
       else [CSVError] -> Either [CSVError] [CSVField]
forall a b. a -> Either a b
Left ((CSVField -> CSVError) -> [CSVField] -> [CSVError]
forall a b. (a -> b) -> [a] -> [b]
map (\CSVField
d-> DuplicateHeader
                              { csvColsExpected :: Int
csvColsExpected = Int
n
                              , csvHeaderSerial :: Int
csvHeaderSerial = CSVField -> Int
csvColNum CSVField
d
                              , csvDuplicate :: [Char]
csvDuplicate = ByteString -> [Char]
BS.unpack (CSVField -> ByteString
csvFieldContent CSVField
d)})
                      [CSVField]
dups)
            Either [CSVError] [CSVField] -> CSVResult -> CSVResult
forall a. a -> [a] -> [a]
: CSVResult
result



-- Reading CSV data is essentially lexical, and can be implemented with a
-- simple finite state machine.  We keep track of logical row number,
-- logical column number (in tabular terms), and textual position (row,col)
-- to enable good error messages.
-- Positional data is retained even after successful lexing, in case a
-- second-stage field parser wants to complain.
--
-- A double-quoted CSV field may contain commas, newlines, and double quotes.

data CSVState  = CSVState  { CSVState -> Int
tableRow, CSVState -> Int
tableCol  :: !Int
                           , CSVState -> Int
textRow,  CSVState -> Int
textCol   :: !Int }
    deriving Int -> CSVState -> ShowS
[CSVState] -> ShowS
CSVState -> [Char]
(Int -> CSVState -> ShowS)
-> (CSVState -> [Char]) -> ([CSVState] -> ShowS) -> Show CSVState
forall a.
(Int -> a -> ShowS) -> (a -> [Char]) -> ([a] -> ShowS) -> Show a
$cshowsPrec :: Int -> CSVState -> ShowS
showsPrec :: Int -> CSVState -> ShowS
$cshow :: CSVState -> [Char]
show :: CSVState -> [Char]
$cshowList :: [CSVState] -> ShowS
showList :: [CSVState] -> ShowS
Show

incTableRow, incTableCol, incTextRow :: CSVState -> CSVState
incTableRow :: CSVState -> CSVState
incTableRow  CSVState
st = CSVState
st { tableRow  = tableRow  st + 1 , tableCol = 1 }
incTableCol :: CSVState -> CSVState
incTableCol  CSVState
st = CSVState
st { tableCol  = tableCol  st + 1 }
incTextRow :: CSVState -> CSVState
incTextRow   CSVState
st = CSVState
st { textRow   = textRow   st + 1 , textCol = 1 }

incTextCol :: Int -> CSVState -> CSVState
incTextCol :: Int -> CSVState -> CSVState
incTextCol Int
n CSVState
st = CSVState
st { textCol   = textCol   st + n }

here :: CSVState -> (Int,Int)
here :: CSVState -> (Int, Int)
here CSVState
st = (CSVState -> Int
textRow CSVState
st, CSVState -> Int
textCol CSVState
st)

-- Lexer is a small finite state machine.
lexCSV :: Bool -> Char -> ByteString -> [CSVField]
lexCSV :: Bool -> Char -> ByteString -> [CSVField]
lexCSV Bool
qn Char
delim =
    Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields Bool
qn Char
delim
              (CSVState{tableRow :: Int
tableRow=Int
1,tableCol :: Int
tableCol=Int
1,textRow :: Int
textRow=Int
1,textCol :: Int
textCol=Int
1}) (Int
1,Int
1)

getFields :: Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields :: Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields Bool
q Char
d CSVState
state (Int, Int)
begin ByteString
bs0
 = case ByteString -> Maybe (Char, ByteString)
BS.uncons ByteString
bs0 of
   Maybe (Char, ByteString)
Nothing -> []
   Just (Char
'"', ByteString
bs1) -> Bool
-> Char
-> CSVState
-> (Int, Int)
-> ByteString
-> ByteString
-> [CSVField]
doStringFieldContent Bool
q Char
d (Int -> CSVState -> CSVState
incTextCol Int
1 CSVState
state) (Int, Int)
begin
                                           ByteString
BS.empty ByteString
bs1
   Maybe (Char, ByteString)
_ ->
       case (Char -> Bool) -> ByteString -> (ByteString, ByteString)
BS.break Char -> Bool
interestingChar ByteString
bs0 of
       (ByteString
fieldBs, ByteString
bs1) ->
           let field :: CSVField
field   = CSVState -> (Int, Int) -> ByteString -> Bool -> CSVField
mkField CSVState
end (Int, Int)
begin ByteString
fieldBs Bool
False
               end :: CSVState
end     = Int -> CSVState -> CSVState
incTextCol (Int
lenInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
1) (CSVState -> CSVState) -> CSVState -> CSVState
forall a b. (a -> b) -> a -> b
$ CSVState
state
               state' :: CSVState
state'  = CSVState -> CSVState
incTableCol (CSVState -> CSVState) -> CSVState -> CSVState
forall a b. (a -> b) -> a -> b
$ Int -> CSVState -> CSVState
incTextCol Int
2 CSVState
end
               stateNL :: CSVState
stateNL = CSVState -> CSVState
incTableRow (CSVState -> CSVState)
-> (CSVState -> CSVState) -> CSVState -> CSVState
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CSVState -> CSVState
incTextRow (CSVState -> CSVState) -> CSVState -> CSVState
forall a b. (a -> b) -> a -> b
$ CSVState
state
               len :: Int
len     = Int64 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int64 -> Int) -> Int64 -> Int
forall a b. (a -> b) -> a -> b
$ ByteString -> Int64
BS.length ByteString
fieldBs
           in case ByteString -> Maybe (Char, ByteString)
BS.uncons ByteString
bs1 of
              Just (Char
c,ByteString
bs2)
                   | Char
cChar -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
d     -> CSVField
fieldCSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
: Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields Bool
q Char
d CSVState
state' (CSVState -> (Int, Int)
here CSVState
state') ByteString
bs2
              Just (Char
'\r',ByteString
bs2) ->
                  case ByteString -> Maybe (Char, ByteString)
BS.uncons ByteString
bs2 of
                  Just (Char
'\n',ByteString
bs3)
                              -> CSVField
fieldCSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
: Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields Bool
q Char
d CSVState
stateNL (CSVState -> (Int, Int)
here CSVState
stateNL) ByteString
bs3
                                 -- XXX This could be an error instead:
                  Maybe (Char, ByteString)
_           -> CSVField
fieldCSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
: Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields Bool
q Char
d CSVState
stateNL (CSVState -> (Int, Int)
here CSVState
stateNL) ByteString
bs2
              Just (Char
'\n',ByteString
bs2) -> CSVField
fieldCSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
: Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields Bool
q Char
d CSVState
stateNL (CSVState -> (Int, Int)
here CSVState
stateNL) ByteString
bs2
              Just (Char
'"', ByteString
_)   -> CSVField
fieldCSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
:
                                 CSVState -> (Int, Int) -> [Char] -> CSVField
mkError CSVState
state' (Int, Int)
begin
                                         [Char]
"unexpected quote, resync at EOL"CSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
:
                                 Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields Bool
q Char
d CSVState
stateNL (CSVState -> (Int, Int)
here CSVState
stateNL)
                                           ((Char -> Bool) -> ByteString -> ByteString
BS.dropWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
'\n') ByteString
bs1)
              Just (Char, ByteString)
_          -> [CSVState -> (Int, Int) -> [Char] -> CSVField
mkError CSVState
state' (Int, Int)
begin [Char]
"XXX Can't happen"]
              Maybe (Char, ByteString)
Nothing         -> CSVField
fieldCSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
: Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields Bool
q Char
d CSVState
stateNL (CSVState -> (Int, Int)
here CSVState
stateNL) ByteString
bs1
 where interestingChar :: Char -> Bool
interestingChar Char
'\r' = Bool
True
       interestingChar Char
'\n' = Bool
True
       interestingChar Char
'"'  = Bool
True
       interestingChar Char
c    | Char
cChar -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
d = Bool
True
       interestingChar Char
_    = Bool
False

doStringFieldContent :: Bool -> Char -> CSVState -> (Int, Int) -> ByteString
                     -> ByteString -> [CSVField]
doStringFieldContent :: Bool
-> Char
-> CSVState
-> (Int, Int)
-> ByteString
-> ByteString
-> [CSVField]
doStringFieldContent Bool
q Char
d CSVState
state (Int, Int)
begin ByteString
acc ByteString
bs1
 = case (Char -> Bool) -> ByteString -> (ByteString, ByteString)
BS.break Char -> Bool
interestingCharInsideString ByteString
bs1 of
   (ByteString
newBs, ByteString
bs2) ->
       let fieldBs :: ByteString
fieldBs = ByteString
acc ByteString -> ByteString -> ByteString
`BS.append` ByteString
newBs
           field :: CSVField
field   = CSVState -> (Int, Int) -> ByteString -> Bool -> CSVField
mkField CSVState
end  (Int, Int)
begin ByteString
fieldBs Bool
True
           end :: CSVState
end     = Int -> CSVState -> CSVState
incTextCol (Int
lenInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
1) CSVState
state
           state' :: CSVState
state'  = CSVState -> CSVState
incTableCol (CSVState -> CSVState) -> CSVState -> CSVState
forall a b. (a -> b) -> a -> b
$ Int -> CSVState -> CSVState
incTextCol Int
3 CSVState
end
           stateNL :: CSVState
stateNL = CSVState -> CSVState
incTableRow (CSVState -> CSVState)
-> (CSVState -> CSVState) -> CSVState -> CSVState
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CSVState -> CSVState
incTextRow (CSVState -> CSVState) -> CSVState -> CSVState
forall a b. (a -> b) -> a -> b
$ CSVState
state
           len :: Int
len     = Int64 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral (Int64 -> Int) -> Int64 -> Int
forall a b. (a -> b) -> a -> b
$ ByteString -> Int64
BS.length ByteString
newBs
       in case ByteString -> Maybe (Char, ByteString)
BS.uncons ByteString
bs2 of
          Just (Char
'\r',ByteString
bs3) ->
              case ByteString -> Maybe (Char, ByteString)
BS.uncons ByteString
bs3 of
              Just (Char
'\n',ByteString
bs4)  | Bool
q ->
                   Bool
-> Char
-> CSVState
-> (Int, Int)
-> ByteString
-> ByteString
-> [CSVField]
doStringFieldContent Bool
q Char
d (CSVState -> CSVState
incTextRow CSVState
end) (Int, Int)
begin
                                     (ByteString
fieldBs ByteString -> ByteString -> ByteString
`BS.append` Char -> ByteString
BS.singleton Char
'\n') ByteString
bs4
              Maybe (Char, ByteString)
_ -> Bool
-> Char
-> CSVState
-> (Int, Int)
-> ByteString
-> ByteString
-> [CSVField]
doStringFieldContent Bool
q Char
d CSVState
end (Int, Int)
begin
                                     (ByteString
fieldBs ByteString -> ByteString -> ByteString
`BS.append` Char -> ByteString
BS.singleton Char
'\r') ByteString
bs3
          Just (Char
'\n',ByteString
bs3) | Bool
q ->
                   Bool
-> Char
-> CSVState
-> (Int, Int)
-> ByteString
-> ByteString
-> [CSVField]
doStringFieldContent Bool
q Char
d (CSVState -> CSVState
incTextRow CSVState
end) (Int, Int)
begin
                                     (ByteString
fieldBs ByteString -> ByteString -> ByteString
`BS.append` Char -> ByteString
BS.singleton Char
'\n') ByteString
bs3
          Just (Char
'\n',ByteString
bs3) ->
                   CSVField
fieldCSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
:
                   CSVState -> (Int, Int) -> [Char] -> CSVField
mkError CSVState
end (Int, Int)
begin [Char]
"Found newline within quoted field"CSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
:
                   Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields Bool
q Char
d CSVState
stateNL (CSVState -> (Int, Int)
here CSVState
stateNL) ByteString
bs3
          Just (Char
'"', ByteString
bs3) ->
              case ByteString -> Maybe (Char, ByteString)
BS.uncons ByteString
bs3 of
              Just (Char
c,ByteString
bs4)
                   | Char
cChar -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
d     -> CSVField
fieldCSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
: Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields Bool
q Char
d CSVState
state' (CSVState -> (Int, Int)
here CSVState
state') ByteString
bs4
              Just (Char
'\r',ByteString
bs4) ->
                  case ByteString -> Maybe (Char, ByteString)
BS.uncons ByteString
bs4 of
                  Just (Char
'\n',ByteString
bs5) ->
                       CSVField
fieldCSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
: Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields Bool
q Char
d CSVState
stateNL (CSVState -> (Int, Int)
here CSVState
stateNL) ByteString
bs5
                       -- XXX This could be an error instead:
                  Maybe (Char, ByteString)
_ -> CSVField
fieldCSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
: Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields Bool
q Char
d CSVState
stateNL (CSVState -> (Int, Int)
here CSVState
stateNL) ByteString
bs4
              Just (Char
'\n',ByteString
bs4) -> CSVField
fieldCSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
: Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields Bool
q Char
d CSVState
stateNL (CSVState -> (Int, Int)
here CSVState
stateNL) ByteString
bs4
              Just (Char
'"',ByteString
bs4)  ->
                  Bool
-> Char
-> CSVState
-> (Int, Int)
-> ByteString
-> ByteString
-> [CSVField]
doStringFieldContent Bool
q Char
d (Int -> CSVState -> CSVState
incTextCol Int
3 CSVState
end) (Int, Int)
begin
                                      (ByteString
fieldBs ByteString -> ByteString -> ByteString
`BS.append` Char -> ByteString
BS.singleton Char
'"') ByteString
bs4
              Just (Char, ByteString)
_  -> CSVField
fieldCSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
:
                         CSVState -> (Int, Int) -> [Char] -> CSVField
mkError CSVState
state' (Int, Int)
begin [Char]
"End-quote not followed by comma"CSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
:
                         Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields Bool
q Char
d CSVState
state' (CSVState -> (Int, Int)
here CSVState
state') ByteString
bs3
              Maybe (Char, ByteString)
Nothing -> CSVField
fieldCSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
: Bool -> Char -> CSVState -> (Int, Int) -> ByteString -> [CSVField]
getFields Bool
q Char
d CSVState
stateNL (CSVState -> (Int, Int)
here CSVState
stateNL) ByteString
bs3
          Just (Char, ByteString)
_  -> [CSVState -> (Int, Int) -> [Char] -> CSVField
mkError CSVState
state' (Int, Int)
begin [Char]
"XXX Can't happen (string field)"]
          Maybe (Char, ByteString)
Nothing -> CSVField
fieldCSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
:
                     CSVState -> (Int, Int) -> [Char] -> CSVField
mkError CSVState
state' (Int, Int)
begin [Char]
"CSV data ends within a quoted string"
                     CSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
:[]
 where interestingCharInsideString :: Char -> Bool
interestingCharInsideString Char
'\r' = Bool
True
       interestingCharInsideString Char
'\n' = Bool
True
       interestingCharInsideString Char
'"'  = Bool
True
       interestingCharInsideString Char
_    = Bool
False

mkField :: CSVState -> (Int, Int) -> ByteString -> Bool -> CSVField
mkField :: CSVState -> (Int, Int) -> ByteString -> Bool -> CSVField
mkField CSVState
st (Int, Int)
begin ByteString
bs Bool
q =   CSVField { csvRowNum :: Int
csvRowNum       = CSVState -> Int
tableRow CSVState
st
                                   , csvColNum :: Int
csvColNum       = CSVState -> Int
tableCol CSVState
st
                                   , csvTextStart :: (Int, Int)
csvTextStart    = (Int, Int)
begin
                                   , csvTextEnd :: (Int, Int)
csvTextEnd      = (CSVState -> Int
textRow CSVState
st,CSVState -> Int
textCol CSVState
st)
                                   , csvFieldContent :: ByteString
csvFieldContent = ByteString
bs
                                   , csvFieldQuoted :: Bool
csvFieldQuoted  = Bool
q }

mkError :: CSVState -> (Int, Int) -> String -> CSVField
mkError :: CSVState -> (Int, Int) -> [Char] -> CSVField
mkError CSVState
st (Int, Int)
begin [Char]
e = CSVFieldError { csvRowNum :: Int
csvRowNum     = CSVState -> Int
tableRow CSVState
st
                                   , csvColNum :: Int
csvColNum     = CSVState -> Int
tableCol CSVState
st
                                   , csvTextStart :: (Int, Int)
csvTextStart  = (Int, Int)
begin
                                   , csvTextEnd :: (Int, Int)
csvTextEnd    = (CSVState -> Int
textRow CSVState
st,CSVState -> Int
textCol CSVState
st)
                                   , csvFieldError :: [Char]
csvFieldError = [Char]
e }


-- Some pretty-printing for structured CSV errors.
ppCSVError :: CSVError -> String
ppCSVError :: CSVError -> [Char]
ppCSVError (err :: CSVError
err@IncorrectRow{}) =
        [Char]
"\nRow "[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++Int -> [Char]
forall a. Show a => a -> [Char]
show (CSVError -> Int
csvRow CSVError
err)[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++[Char]
" has wrong number of fields."[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++
        [Char]
"\n    Expected "[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++Int -> [Char]
forall a. Show a => a -> [Char]
show (CSVError -> Int
csvColsExpected CSVError
err)[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++[Char]
" but got "[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++
        Int -> [Char]
forall a. Show a => a -> [Char]
show (CSVError -> Int
csvColsActual CSVError
err)[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++[Char]
"."[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++
        [Char]
"\n    The fields are:"[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++
        Int -> ShowS
indent Int
8 ((CSVField -> [Char]) -> [CSVField] -> [Char]
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap CSVField -> [Char]
ppCSVField (CSVError -> [CSVField]
csvFields CSVError
err))
ppCSVError (err :: CSVError
err@BlankLine{}) =
        [Char]
"\nRow "[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++Int -> [Char]
forall a. Show a => a -> [Char]
show (CSVError -> Int
csvRow CSVError
err)[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++[Char]
" is blank."[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++
        [Char]
"\n    Expected "[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++Int -> [Char]
forall a. Show a => a -> [Char]
show (CSVError -> Int
csvColsExpected CSVError
err)[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++[Char]
" fields."
ppCSVError (err :: CSVError
err@FieldError{}) = CSVField -> [Char]
ppCSVField (CSVError -> CSVField
csvField CSVError
err)
ppCSVError (err :: CSVError
err@DuplicateHeader{}) =
        [Char]
"\nThere are two (or more) identical column headers: "[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++
        ShowS
forall a. Show a => a -> [Char]
show (CSVError -> [Char]
csvDuplicate CSVError
err)[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++[Char]
"."[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++
        [Char]
"\n    Column number "[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++Int -> [Char]
forall a. Show a => a -> [Char]
show (CSVError -> Int
csvHeaderSerial CSVError
err)
ppCSVError (NoData{})         =
        [Char]
"\nNo usable data (after accounting for any other errors)."

-- | Pretty-printing for CSV fields, shows positional information in addition
--   to the textual content.
ppCSVField :: CSVField -> String
ppCSVField :: CSVField -> [Char]
ppCSVField (f :: CSVField
f@CSVField{}) =
        [Char]
"\n"[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++ByteString -> [Char]
BS.unpack (Bool -> ByteString -> ByteString
quoted (CSVField -> Bool
csvFieldQuoted CSVField
f) (CSVField -> ByteString
csvFieldContent CSVField
f))[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++
        [Char]
"\nin row "[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++Int -> [Char]
forall a. Show a => a -> [Char]
show (CSVField -> Int
csvRowNum CSVField
f)[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++[Char]
" at column "[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++Int -> [Char]
forall a. Show a => a -> [Char]
show (CSVField -> Int
csvColNum CSVField
f)[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++
        [Char]
" (textually from "[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++(Int, Int) -> [Char]
forall a. Show a => a -> [Char]
show (CSVField -> (Int, Int)
csvTextStart CSVField
f)[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++[Char]
" to "[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++
        (Int, Int) -> [Char]
forall a. Show a => a -> [Char]
show (CSVField -> (Int, Int)
csvTextEnd CSVField
f)[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++[Char]
")"
ppCSVField (f :: CSVField
f@CSVFieldError{}) =
        [Char]
"\n"[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++CSVField -> [Char]
csvFieldError CSVField
f[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++
        [Char]
"\nin row "[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++Int -> [Char]
forall a. Show a => a -> [Char]
show (CSVField -> Int
csvRowNum CSVField
f)[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++[Char]
" at column "[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++Int -> [Char]
forall a. Show a => a -> [Char]
show (CSVField -> Int
csvColNum CSVField
f)[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++
        [Char]
" (textually from "[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++(Int, Int) -> [Char]
forall a. Show a => a -> [Char]
show (CSVField -> (Int, Int)
csvTextStart CSVField
f)[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++[Char]
" to "[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++
        (Int, Int) -> [Char]
forall a. Show a => a -> [Char]
show (CSVField -> (Int, Int)
csvTextEnd CSVField
f)


-- | Output a table back to a lazily-constructed string.  There are lots of
--   possible design decisions one could take, e.g. to re-arrange columns
--   back into something resembling their original order, but here we just
--   take the given table without looking at Row and Field numbers etc.
ppCSVTable :: CSVTable -> ByteString
ppCSVTable :: CSVTable -> ByteString
ppCSVTable = [ByteString] -> ByteString
BS.unlines ([ByteString] -> ByteString)
-> (CSVTable -> [ByteString]) -> CSVTable -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([CSVField] -> ByteString) -> CSVTable -> [ByteString]
forall a b. (a -> b) -> [a] -> [b]
map (ByteString -> [ByteString] -> ByteString
BS.intercalate ([Char] -> ByteString
BS.pack [Char]
",") ([ByteString] -> ByteString)
-> ([CSVField] -> [ByteString]) -> [CSVField] -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (CSVField -> ByteString) -> [CSVField] -> [ByteString]
forall a b. (a -> b) -> [a] -> [b]
map CSVField -> ByteString
ppField)
  where ppField :: CSVField -> ByteString
ppField CSVField
f = Bool -> ByteString -> ByteString
quoted (CSVField -> Bool
csvFieldQuoted CSVField
f) (CSVField -> ByteString
csvFieldContent CSVField
f)

-- | Output a table back to a lazily-constructed bytestring, using the given
--   delimiter char.  The Boolean argument is to repair fields containing
--   newlines, by replacing the nl with a space.
ppDSVTable :: Bool -> Char -> CSVTable -> ByteString
ppDSVTable :: Bool -> Char -> CSVTable -> ByteString
ppDSVTable Bool
nl Char
d = [ByteString] -> ByteString
BS.unlines ([ByteString] -> ByteString)
-> (CSVTable -> [ByteString]) -> CSVTable -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ([CSVField] -> ByteString) -> CSVTable -> [ByteString]
forall a b. (a -> b) -> [a] -> [b]
map (ByteString -> [ByteString] -> ByteString
BS.intercalate ([Char] -> ByteString
BS.pack [Char
d]) ([ByteString] -> ByteString)
-> ([CSVField] -> [ByteString]) -> [CSVField] -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (CSVField -> ByteString) -> [CSVField] -> [ByteString]
forall a b. (a -> b) -> [a] -> [b]
map CSVField -> ByteString
ppField)
  where ppField :: CSVField -> ByteString
ppField CSVField
f = Bool -> ByteString -> ByteString
quoted (CSVField -> Bool
csvFieldQuoted CSVField
f) (ByteString -> ByteString
doNL (ByteString -> ByteString) -> ByteString -> ByteString
forall a b. (a -> b) -> a -> b
$ CSVField -> ByteString
csvFieldContent CSVField
f)
        doNL :: ByteString -> ByteString
doNL | Bool
nl        = ByteString -> ByteString
replaceNL
             | Bool
otherwise = ByteString -> ByteString
forall a. a -> a
id

{-
-- | Output a table back to a string, but using Haskell list-of-tuple notation
--   rather than CSV.
ppCSVTableAsTuples :: CSVTable -> String
ppCSVTableAsTuples = indent 4 . unlines . map ( (", ("++) . (++")")
                                              . intercalate ", " . map ppField )
  where ppField f = quoted (csvFieldQuoted f) (BS.unpack (csvFieldContent f))
-}

-- Some pp helpers - indent and quoted - should live elsewhere, in a
-- pretty-printing package.

indent :: Int -> String -> String
indent :: Int -> ShowS
indent Int
n = [[Char]] -> [Char]
unlines ([[Char]] -> [Char]) -> ([Char] -> [[Char]]) -> ShowS
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ShowS -> [[Char]] -> [[Char]]
forall a b. (a -> b) -> [a] -> [b]
map (Int -> Char -> [Char]
forall a. Int -> a -> [a]
replicate Int
n Char
' ' [Char] -> ShowS
forall a. [a] -> [a] -> [a]
++) ([[Char]] -> [[Char]])
-> ([Char] -> [[Char]]) -> [Char] -> [[Char]]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Char] -> [[Char]]
lines

quoted :: Bool -> ByteString -> ByteString
quoted :: Bool -> ByteString -> ByteString
quoted Bool
False  ByteString
s  = ByteString
s
quoted Bool
True   ByteString
s  = [ByteString] -> ByteString
BS.concat [[Char] -> ByteString
BS.pack [Char]
"\"", ByteString -> ByteString
escape ByteString
s, [Char] -> ByteString
BS.pack[Char]
"\""]
  where escape :: ByteString -> ByteString
escape ByteString
s = let (ByteString
good,ByteString
next) = (Char -> Bool) -> ByteString -> (ByteString, ByteString)
BS.span (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
'"') ByteString
s
                   in if ByteString -> Bool
BS.null ByteString
next then ByteString
good
                    else [ByteString] -> ByteString
BS.concat [ ByteString
good, [Char] -> ByteString
BS.pack [Char]
"\"\"", ByteString -> ByteString
escape (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
BS.tail ByteString
next) ]

replaceNL :: ByteString -> ByteString
replaceNL :: ByteString -> ByteString
replaceNL ByteString
s = let (ByteString
good,ByteString
next) = (Char -> Bool) -> ByteString -> (ByteString, ByteString)
BS.span (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
'\n') ByteString
s
              in if ByteString -> Bool
BS.null ByteString
next then ByteString
good
                 else if ByteString -> Bool
BS.null ByteString
good then ByteString -> ByteString
replaceNL (HasCallStack => ByteString -> ByteString
ByteString -> ByteString
BS.tail ByteString
next)
                 else [ByteString] -> ByteString
BS.concat [ ByteString
good, [Char] -> ByteString
BS.pack [Char]
" ", ByteString -> ByteString
replaceNL ByteString
next ]


-- | Convert a CSV table to a simpler representation, by dropping all
--   the original location information.
fromCSVTable :: CSVTable -> [[ByteString]]
fromCSVTable :: CSVTable -> [[ByteString]]
fromCSVTable = ([CSVField] -> [ByteString]) -> CSVTable -> [[ByteString]]
forall a b. (a -> b) -> [a] -> [b]
map ((CSVField -> ByteString) -> [CSVField] -> [ByteString]
forall a b. (a -> b) -> [a] -> [b]
map CSVField -> ByteString
csvFieldContent)

-- | Convert a simple list of lists into a CSVTable by the addition of
--   logical locations.  (Textual locations are not so useful.)
--   Rows of varying lengths generate errors.  Fields that need
--   quotation marks are automatically marked as such.
toCSVTable   :: [[ByteString]] -> ([CSVError], CSVTable)
toCSVTable :: [[ByteString]] -> ([CSVError], CSVTable)
toCSVTable []         = ([CSVError
NoData], [])
toCSVTable rows :: [[ByteString]]
rows@([ByteString]
r:[[ByteString]]
_) = (\ ([[CSVError]]
a,CSVTable
b)-> ([[CSVError]] -> [CSVError]
forall (t :: * -> *) a. Foldable t => t [a] -> [a]
concat [[CSVError]]
a, CSVTable
b)) (([[CSVError]], CSVTable) -> ([CSVError], CSVTable))
-> ([[CSVError]], CSVTable) -> ([CSVError], CSVTable)
forall a b. (a -> b) -> a -> b
$
                        [([CSVError], [CSVField])] -> ([[CSVError]], CSVTable)
forall a b. [(a, b)] -> ([a], [b])
unzip ((Int -> [ByteString] -> ([CSVError], [CSVField]))
-> [Int] -> [[ByteString]] -> [([CSVError], [CSVField])]
forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith Int -> [ByteString] -> ([CSVError], [CSVField])
walk [Int
1..] [[ByteString]]
rows)
  where
    n :: Int
n            = [ByteString] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [ByteString]
r
    walk        :: Int -> [ByteString] -> ([CSVError], CSVRow)
    walk :: Int -> [ByteString] -> ([CSVError], [CSVField])
walk Int
rnum [] = ( [Int -> CSVError
blank Int
rnum]
                   , (Int -> CSVField) -> [Int] -> [CSVField]
forall a b. (a -> b) -> [a] -> [b]
map (\Int
c-> Int -> Int -> ByteString -> CSVField
mkCSVField Int
rnum Int
c (ByteString
BS.empty)) [Int
1..Int
n])
    walk Int
rnum [ByteString]
cs = ( if [ByteString] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [ByteString]
cs Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
/= Int
n then [Int -> [ByteString] -> CSVError
bad Int
rnum [ByteString]
cs] else []
                   , (Int -> ByteString -> CSVField)
-> [Int] -> [ByteString] -> [CSVField]
forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith (Int -> Int -> ByteString -> CSVField
mkCSVField Int
rnum) [Int
1..Int
n] [ByteString]
cs )

    blank :: Int -> CSVError
blank Int
rnum =  BlankLine{ csvRow :: Int
csvRow          = Int
rnum
                           , csvColsExpected :: Int
csvColsExpected = Int
n
                           , csvColsActual :: Int
csvColsActual   = Int
0
                           , csvField :: CSVField
csvField        = Int -> Int -> ByteString -> CSVField
mkCSVField Int
rnum Int
0 ByteString
BS.empty
                           }
    bad :: Int -> [ByteString] -> CSVError
bad Int
r [ByteString]
cs = IncorrectRow{ csvRow :: Int
csvRow          = Int
r
                           , csvColsExpected :: Int
csvColsExpected = Int
n
                           , csvColsActual :: Int
csvColsActual   = [ByteString] -> Int
forall a. [a] -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length [ByteString]
cs
                           , csvFields :: [CSVField]
csvFields       = (Int -> ByteString -> CSVField)
-> [Int] -> [ByteString] -> [CSVField]
forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith (Int -> Int -> ByteString -> CSVField
mkCSVField Int
r) [Int
1..] [ByteString]
cs
                           }


-- | Select and/or re-arrange columns from a CSV table, based on names in the
--   header row of the table.  The original header row is re-arranged too.
--   The result is either a list of column names that were not present, or
--   the (possibly re-arranged) sub-table.
selectFields :: [String] -> CSVTable -> Either [String] CSVTable
selectFields :: [[Char]] -> CSVTable -> Either [[Char]] CSVTable
selectFields [[Char]]
names CSVTable
table
    | CSVTable -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null CSVTable
table          = [[Char]] -> Either [[Char]] CSVTable
forall a b. a -> Either a b
Left [[Char]]
names
    | Bool -> Bool
not ([[Char]] -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [[Char]]
missing)  = [[Char]] -> Either [[Char]] CSVTable
forall a b. a -> Either a b
Left [[Char]]
missing
    | Bool
otherwise           = CSVTable -> Either [[Char]] CSVTable
forall a b. b -> Either a b
Right (([CSVField] -> [CSVField]) -> CSVTable -> CSVTable
forall a b. (a -> b) -> [a] -> [b]
map [CSVField] -> [CSVField]
forall {b}. [b] -> [b]
select CSVTable
table)
  where
    header :: [[Char]]
header         = (CSVField -> [Char]) -> [CSVField] -> [[Char]]
forall a b. (a -> b) -> [a] -> [b]
map (ByteString -> [Char]
BS.unpack (ByteString -> [Char])
-> (CSVField -> ByteString) -> CSVField -> [Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CSVField -> ByteString
csvFieldContent) (CSVTable -> [CSVField]
forall a. HasCallStack => [a] -> a
head CSVTable
table)
    missing :: [[Char]]
missing        = ([Char] -> Bool) -> [[Char]] -> [[Char]]
forall a. (a -> Bool) -> [a] -> [a]
filter ([Char] -> [[Char]] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`notElem` [[Char]]
header) [[Char]]
names
    reordering :: [Int]
reordering     = ([Char] -> Int) -> [[Char]] -> [Int]
forall a b. (a -> b) -> [a] -> [b]
map (Maybe Int -> Int
forall a. HasCallStack => Maybe a -> a
fromJust (Maybe Int -> Int) -> ([Char] -> Maybe Int) -> [Char] -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (\[Char]
n-> [Char] -> [[Char]] -> Maybe Int
forall a. Eq a => a -> [a] -> Maybe Int
elemIndex [Char]
n [[Char]]
header)) [[Char]]
names
    select :: [b] -> [b]
select [b]
fields  = (Int -> b) -> [Int] -> [b]
forall a b. (a -> b) -> [a] -> [b]
map ([b]
fields[b] -> Int -> b
forall a. HasCallStack => [a] -> Int -> a
!!) [Int]
reordering

-- | Validate that the columns of a table have exactly the names and
--   ordering given in the argument.
expectFields :: [String] -> CSVTable -> Either [String] CSVTable
expectFields :: [[Char]] -> CSVTable -> Either [[Char]] CSVTable
expectFields [[Char]]
names CSVTable
table
    | CSVTable -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null CSVTable
table          = [[Char]] -> Either [[Char]] CSVTable
forall a b. a -> Either a b
Left [[Char]
"CSV table is empty"]
    | Bool -> Bool
not ([[Char]] -> Bool
forall a. [a] -> Bool
forall (t :: * -> *) a. Foldable t => t a -> Bool
null [[Char]]
missing)  = [[Char]] -> Either [[Char]] CSVTable
forall a b. a -> Either a b
Left (ShowS -> [[Char]] -> [[Char]]
forall a b. (a -> b) -> [a] -> [b]
map ([Char]
"CSV table is missing field: "[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++)
                                      [[Char]]
missing)
    | [[Char]]
header [[Char]] -> [[Char]] -> Bool
forall a. Eq a => a -> a -> Bool
/= [[Char]]
names     = [[Char]] -> Either [[Char]] CSVTable
forall a b. a -> Either a b
Left [[Char]
"CSV columns are in the wrong order"
                                 ,[Char]
"Expected: "[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++[Char] -> [[Char]] -> [Char]
forall a. [a] -> [[a]] -> [a]
intercalate [Char]
", " [[Char]]
names
                                 ,[Char]
"Found:    "[Char] -> ShowS
forall a. [a] -> [a] -> [a]
++[Char] -> [[Char]] -> [Char]
forall a. [a] -> [[a]] -> [a]
intercalate [Char]
", " [[Char]]
header]
    | Bool
otherwise           = CSVTable -> Either [[Char]] CSVTable
forall a b. b -> Either a b
Right CSVTable
table
  where
    header :: [[Char]]
header         = (CSVField -> [Char]) -> [CSVField] -> [[Char]]
forall a b. (a -> b) -> [a] -> [b]
map (ByteString -> [Char]
BS.unpack (ByteString -> [Char])
-> (CSVField -> ByteString) -> CSVField -> [Char]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. CSVField -> ByteString
csvFieldContent) (CSVTable -> [CSVField]
forall a. HasCallStack => [a] -> a
head CSVTable
table)
    missing :: [[Char]]
missing        = ([Char] -> Bool) -> [[Char]] -> [[Char]]
forall a. (a -> Bool) -> [a] -> [a]
filter ([Char] -> [[Char]] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`notElem` [[Char]]
header) [[Char]]
names

-- | A join operator, adds the columns of two tables together.
--   Precondition: the tables have the same number of rows.
joinCSV :: CSVTable -> CSVTable -> CSVTable
joinCSV :: CSVTable -> CSVTable -> CSVTable
joinCSV = ([CSVField] -> [CSVField] -> [CSVField])
-> CSVTable -> CSVTable -> CSVTable
forall a b c. (a -> b -> c) -> [a] -> [b] -> [c]
zipWith [CSVField] -> [CSVField] -> [CSVField]
forall a. [a] -> [a] -> [a]
(++)

-- | A generator for a new CSV column, of arbitrary length.
--   The result can be joined to an existing table if desired.
mkEmptyColumn :: String -> CSVTable
mkEmptyColumn :: [Char] -> CSVTable
mkEmptyColumn [Char]
header = [CSVField
headField] [CSVField] -> CSVTable -> CSVTable
forall a. a -> [a] -> [a]
: (Int -> [CSVField]) -> [Int] -> CSVTable
forall a b. (a -> b) -> [a] -> [b]
map ((CSVField -> [CSVField] -> [CSVField]
forall a. a -> [a] -> [a]
:[])(CSVField -> [CSVField]) -> (Int -> CSVField) -> Int -> [CSVField]
forall b c a. (b -> c) -> (a -> b) -> a -> c
.Int -> CSVField
emptyField) [Int
2..]
  where
    headField :: CSVField
headField = (Int -> CSVField
emptyField Int
1) { csvFieldContent = BS.pack header
                               , csvFieldQuoted  = True }
    emptyField :: Int -> CSVField
emptyField Int
n = CSVField { csvRowNum :: Int
csvRowNum       = Int
n
                            , csvColNum :: Int
csvColNum       = Int
0
                            , csvTextStart :: (Int, Int)
csvTextStart    = (Int
0,Int
0)
                            , csvTextEnd :: (Int, Int)
csvTextEnd      = (Int
0,Int
0)
                            , csvFieldContent :: ByteString
csvFieldContent = ByteString
BS.empty
                            , csvFieldQuoted :: Bool
csvFieldQuoted  = Bool
False
                            }

-- | Generate a fresh field with the given textual content.
--   The quoting flag is set automatically based on the text.
--   Textual extents are not particularly useful, since there was no original
--   input to refer to.
mkCSVField :: Int -> Int -> ByteString -> CSVField
mkCSVField :: Int -> Int -> ByteString -> CSVField
mkCSVField Int
n Int
c ByteString
text =
        CSVField { csvRowNum :: Int
csvRowNum       = Int
n
                 , csvColNum :: Int
csvColNum       = Int
c
                 , csvTextStart :: (Int, Int)
csvTextStart    = (Int
0,Int
0)
                 , csvTextEnd :: (Int, Int)
csvTextEnd      = ( Int64 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral
                                             (Int64 -> Int) -> (ByteString -> Int64) -> ByteString -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Int64
BS.length 
                                             (ByteString -> Int64)
-> (ByteString -> ByteString) -> ByteString -> Int64
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> ByteString -> ByteString
BS.filter (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
==Char
'\n')
                                             (ByteString -> Int) -> ByteString -> Int
forall a b. (a -> b) -> a -> b
$ ByteString
text
                                     , Int64 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral
                                             (Int64 -> Int) -> (ByteString -> Int64) -> ByteString -> Int
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> Int64
BS.length
                                             (ByteString -> Int64)
-> (ByteString -> ByteString) -> ByteString -> Int64
forall b c a. (b -> c) -> (a -> b) -> a -> c
. (Char -> Bool) -> ByteString -> ByteString
BS.takeWhile (Char -> Char -> Bool
forall a. Eq a => a -> a -> Bool
/=Char
'\n')
                                             (ByteString -> ByteString)
-> (ByteString -> ByteString) -> ByteString -> ByteString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. ByteString -> ByteString
BS.reverse (ByteString -> Int) -> ByteString -> Int
forall a b. (a -> b) -> a -> b
$ ByteString
text )
                 , csvFieldContent :: ByteString
csvFieldContent = ByteString
text
                 , csvFieldQuoted :: Bool
csvFieldQuoted  = (Char -> Bool) -> [Char] -> Bool
forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool
any (Char -> [Char] -> Bool
forall a. Eq a => a -> [a] -> Bool
forall (t :: * -> *) a. (Foldable t, Eq a) => a -> t a -> Bool
`elem`[Char]
"\",\n\r") (ByteString -> [Char]
BS.unpack ByteString
text)
                 }