csv.pl

View source with formatted comments or as raw
    1/*  Part of SWI-Prolog
    2
    3    Author:        Jan Wielemaker
    4    E-mail:        J.Wielemaker@vu.nl
    5    WWW:           http://www.swi-prolog.org
    6    Copyright (c)  2009-2017, VU University Amsterdam
    7    All rights reserved.
    8
    9    Redistribution and use in source and binary forms, with or without
   10    modification, are permitted provided that the following conditions
   11    are met:
   12
   13    1. Redistributions of source code must retain the above copyright
   14       notice, this list of conditions and the following disclaimer.
   15
   16    2. Redistributions in binary form must reproduce the above copyright
   17       notice, this list of conditions and the following disclaimer in
   18       the documentation and/or other materials provided with the
   19       distribution.
   20
   21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
   24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
   25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
   26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
   27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
   29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
   30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
   31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
   32    POSSIBILITY OF SUCH DAMAGE.
   33*/
   34
   35:- module(csv,
   36          [ csv//1,                     % +Rows
   37            csv//2,                     % +Rows, +Options
   38
   39            csv_read_file/2,            % +File, -Data
   40            csv_read_file/3,            % +File, -Data, +Options
   41            csv_read_file_row/3,        % +File, -Row, +Options
   42            csv_read_row/3,		% +Stream, -Row, +CompiledOptions
   43            csv_options/2,		% -Compiled, +Options
   44
   45            csv_write_file/2,           % +File, +Data
   46            csv_write_file/3,           % +File, +Data, +Options
   47            csv_write_stream/3          % +Stream, +Data, +Options
   48          ]).   49:- use_module(library(record)).   50:- use_module(library(error)).   51:- use_module(library(pure_input)).   52:- use_module(library(debug)).   53:- use_module(library(option)).   54
   55/** <module> Process CSV (Comma-Separated Values) data
   56
   57This library parses and generates CSV data.   CSV data is represented in
   58Prolog as a list of rows. Each row   is  a compound term, where all rows
   59have the same name and arity.
   60
   61@tbd    Implement immediate assert of the data to avoid possible stack
   62        overflows.
   63@tbd    Writing creates an intermediate code-list, possibly overflowing
   64        resources.  This waits for pure output!
   65@see RFC 4180
   66*/
   67
   68:- predicate_options(csv//2, 2,
   69                     [ separator(nonneg),       % mustv be code
   70                       strip(boolean),
   71                       ignore_quotes(boolean),
   72                       convert(boolean),
   73                       case(oneof([down,preserve,up])),
   74                       functor(atom),
   75                       arity(-nonneg),          % actually ?nonneg
   76                       match_arity(boolean)
   77                     ]).   78:- predicate_options(csv_read_file/3, 3,
   79                     [ pass_to(csv//2, 2),
   80                       pass_to(phrase_from_file/3, 3)
   81                     ]).   82:- predicate_options(csv_read_file_row/3, 3,
   83                     [ pass_to(csv//2, 2),
   84                       pass_to(open/4, 4)
   85                     ]).   86:- predicate_options(csv_write_file/3, 3,
   87                     [ pass_to(csv//2, 2),
   88                       pass_to(open/4, 4)
   89                     ]).   90:- predicate_options(csv_write_stream/3, 3,
   91                     [ pass_to(csv//2, 2)
   92                     ]).   93
   94
   95:- record
   96    csv_options(separator:integer=0',,
   97                strip:boolean=false,
   98                ignore_quotes:boolean=false,
   99                convert:boolean=true,
  100                case:oneof([down,preserve,up])=preserve,
  101                functor:atom=row,
  102                arity:integer,
  103                match_arity:boolean=true).  104
  105
  106%!  csv_read_file(+File, -Rows) is det.
  107%!  csv_read_file(+File, -Rows, +Options) is det.
  108%
  109%   Read a CSV file into a list of   rows. Each row is a Prolog term
  110%   with the same arity. Options  is   handed  to  csv//2. Remaining
  111%   options  are  processed  by    phrase_from_file/3.  The  default
  112%   separator depends on the file name   extension and is =|\t|= for
  113%   =|.tsv|= files and =|,|= otherwise.
  114%
  115%   Suppose we want to create a predicate   table/6  from a CSV file
  116%   that we know contains 6 fields  per   record.  This  can be done
  117%   using the code below. Without the   option  arity(6), this would
  118%   generate a predicate table/N, where N   is  the number of fields
  119%   per record in the data.
  120%
  121%       ==
  122%       ?- csv_read_file(File, Rows, [functor(table), arity(6)]),
  123%          maplist(assert, Rows).
  124%       ==
  125
  126
  127csv_read_file(File, Rows) :-
  128    csv_read_file(File, Rows, []).
  129
  130csv_read_file(File, Rows, Options) :-
  131    default_separator(File, Options, Options1),
  132    make_csv_options(Options1, Record, RestOptions),
  133    phrase_from_file(csv_roptions(Rows, Record), File, RestOptions).
  134
  135
  136default_separator(File, Options0, Options) :-
  137    (   option(separator(_), Options0)
  138    ->  Options = Options0
  139    ;   file_name_extension(_, Ext0, File),
  140        downcase_atom(Ext0, Ext),
  141        ext_separator(Ext, Sep)
  142    ->  Options = [separator(Sep)|Options0]
  143    ;   Options = Options0
  144    ).
  145
  146ext_separator(csv, 0',).
  147ext_separator(tsv, 0'\t).
  148
  149
  150%!  csv(?Rows)// is det.
  151%!  csv(?Rows, +Options)// is det.
  152%
  153%   Prolog DCG to `read/write' CSV data.  Options:
  154%
  155%       * separator(+Code)
  156%       The comma-separator.  Must be a character code.  Default is
  157%       (of course) the comma. Character codes can be specified
  158%       using the 0' notion. E.g., using =|separator(0';)|= parses
  159%       a semicolon separated file.
  160%
  161%       * ignore_quotes(+Boolean)
  162%       If =true= (default false), threat double quotes as a normal
  163%       character.
  164%
  165%       * strip(+Boolean)
  166%       If =true= (default =false=), strip leading and trailing
  167%       blank space.  RFC4180 says that blank space is part of the
  168%       data.
  169%
  170%       * convert(+Boolean)
  171%       If =true= (default), use name/2 on the field data.  This
  172%       translates the field into a number if possible.
  173%
  174%       * case(+Action)
  175%       If =down=, downcase atomic values.  If =up=, upcase them
  176%       and if =preserve= (default), do not change the case.
  177%
  178%       * functor(+Atom)
  179%       Functor to use for creating row terms.  Default is =row=.
  180%
  181%       * arity(?Arity)
  182%       Number of fields in each row.  This predicate raises
  183%       a domain_error(row_arity(Expected), Found) if a row is
  184%       found with different arity.
  185%
  186%       * match_arity(+Boolean)
  187%       If =false= (default =true=), do not reject CSV files where
  188%       lines provide a varying number of fields (columns).  This
  189%       can be a work-around to use some incorrect CSV files.
  190
  191csv(Rows) -->
  192    csv(Rows, []).
  193
  194csv(Rows, Options) -->
  195    { make_csv_options(Options, Record, _) },
  196    csv_roptions(Rows, Record).
  197
  198csv_roptions(Rows, Record) -->
  199    { ground(Rows) },
  200    !,
  201    emit_csv(Rows, Record).
  202csv_roptions(Rows, Record) -->
  203    csv_data(Rows, Record).
  204
  205csv_data([], _) -->
  206    eof,
  207    !.
  208csv_data([Row|More], Options) -->
  209    row(Row, Options),
  210    !,
  211    { debug(csv, 'Row: ~p', [Row]) },
  212    csv_data(More, Options).
  213
  214eof([], []).
  215
  216row(Row, Options) -->
  217    fields(Fields, Options),
  218    { csv_options_functor(Options, Functor),
  219      Row =.. [Functor|Fields],
  220      functor(Row, _, Arity),
  221      check_arity(Options, Arity)
  222    }.
  223
  224check_arity(Options, Arity) :-
  225    csv_options_arity(Options, Arity),
  226    !.
  227check_arity(Options, _) :-
  228    csv_options_match_arity(Options, false),
  229    !.
  230check_arity(Options, Arity) :-
  231    csv_options_arity(Options, Expected),
  232    domain_error(row_arity(Expected), Arity).
  233
  234fields([F|T], Options) -->
  235    field(F, Options),
  236    (   separator(Options)
  237    ->  fields(T, Options)
  238    ;   end_of_record
  239    ->  { T = [] }
  240    ).
  241
  242field(Value, Options) -->
  243    "\"",
  244    { csv_options_ignore_quotes(Options, false) },
  245    !,
  246    string_codes(Codes),
  247    { make_value(Codes, Value, Options) }.
  248field(Value, Options) -->
  249    { csv_options_strip(Options, true) },
  250    !,
  251    stripped_field(Value, Options).
  252field(Value, Options) -->
  253    { csv_options_separator(Options, Sep) },
  254    field_codes(Codes, Sep),
  255    { make_value(Codes, Value, Options) }.
  256
  257
  258stripped_field(Value, Options) -->
  259    ws,
  260    (   "\"",
  261        { csv_options_strip(Options, false) }
  262    ->  string_codes(Codes),
  263        ws
  264    ;   { csv_options_separator(Options, Sep) },
  265        field_codes(Codes0, Sep),
  266        { strip_trailing_ws(Codes0, Codes) }
  267    ),
  268    { make_value(Codes, Value, Options) }.
  269
  270ws --> " ", !, ws.
  271ws --> "\t", !, ws.
  272ws --> "".
  273
  274strip_trailing_ws(List, Stripped) :-
  275    append(Stripped, WS, List),
  276    all_ws(WS).
  277
  278all_ws([]).
  279all_ws([32|T]) :- all_ws(T).
  280all_ws([9|T]) :- all_ws(T).
  281
  282
  283%!  string_codes(-Codes)
  284%
  285%   Process a double-quotes string where  the   quote  is escaped by
  286%   doubling it. Eats the terminating double-quote.
  287
  288string_codes(List) -->
  289    [H],
  290    (   { H == 0'" }
  291    ->  (   "\""
  292        ->  { List = [H|T] },
  293            string_codes(T)
  294        ;   { List = [] }
  295        )
  296    ;   { List = [H|T] },
  297        string_codes(T)
  298    ).
  299
  300field_codes([], Sep), [Sep] --> [Sep], !.
  301field_codes([], _), "\n" --> "\r\n", !.
  302field_codes([], _), "\n" --> "\n", !.
  303field_codes([H|T], Sep) --> [H], !, field_codes(T, Sep).
  304field_codes([], _) --> [].              % unterminated last record
  305
  306%!  make_value(+Codes, -Value, +Options) is det.
  307%
  308%   Convert a list of character codes to the actual value, depending
  309%   on Options.
  310
  311make_value(Codes, Value, Options) :-
  312    csv_options_convert(Options, Convert),
  313    csv_options_case(Options, Case),
  314    make_value(Convert, Case, Codes, Value).
  315
  316make_value(true, preserve, Codes, Value) :-
  317    !,
  318    name(Value, Codes).
  319make_value(true, Case, Codes, Value) :-
  320    !,
  321    (   number_string(Value, Codes)
  322    ->  true
  323    ;   make_value(false, Case, Codes, Value)
  324    ).
  325make_value(false, preserve, Codes, Value) :-
  326    !,
  327    atom_codes(Value, Codes).
  328make_value(false, down, Codes, Value) :-
  329    !,
  330    string_codes(String, Codes),
  331    downcase_atom(String, Value).
  332make_value(false, up, Codes, Value) :-
  333    string_codes(String, Codes),
  334    upcase_atom(String, Value).
  335
  336separator(Options) -->
  337    { csv_options_separator(Options, Sep) },
  338    [Sep].
  339
  340end_of_record --> "\n".			% Unix files
  341end_of_record --> "\r\n".               % DOS files
  342end_of_record --> "\r".                 % MacOS files
  343end_of_record --> eof.                  % unterminated last record
  344
  345
  346%!  csv_read_file_row(+File, -Row, +Options) is nondet.
  347%
  348%   True when Row is a row in File.  First unifies Row with the first
  349%   row in File. Backtracking  yields  the   second,  ...  row.  This
  350%   interface  is  an  alternative  to  csv_read_file/3  that  avoids
  351%   loading all rows in memory.  Note   that  this interface does not
  352%   guarantee that all rows in File have the same arity.
  353%
  354%   In addition to the  options   of  csv_read_file/3, this predicate
  355%   processes the option:
  356%
  357%     * line(-Line)
  358%     Line is unified with the 1-based line-number from which Row is
  359%     read.  Note that Line is not the physical line, but rather the
  360%     _logical_ record number.
  361%
  362%   @tbd    Input is read line by line.  If a record separator is
  363%           embedded in a quoted field, parsing the record fails and
  364%           another line is added to the input.  This does not nicely
  365%           deal with other reasons why parsing the row may fail.
  366
  367csv_read_file_row(File, Row, Options) :-
  368    default_separator(File, Options, Options1),
  369    make_csv_options(Options1, RecordOptions, Options2),
  370    select_option(line(Line), Options2, RestOptions, _),
  371    setup_call_cleanup(
  372        open(File, read, Stream, RestOptions),
  373        csv_read_stream_row(Stream, Row, Line, RecordOptions),
  374        close(Stream)).
  375
  376csv_read_stream_row(Stream, Row, Line, Options) :-
  377    between(1, infinite, Line),
  378    (   csv_read_row(Stream, Row0, Options),
  379        Row0 \== end_of_file
  380    ->  Row = Row0
  381    ;   !,
  382        fail
  383    ).
  384
  385
  386%!  csv_read_row(+Stream, -Row, +CompiledOptions) is det.
  387%
  388%   Read the next CSV record from Stream  and unify the result with Row.
  389%   CompiledOptions is created from  options   defined  for csv//2 using
  390%   csv_options/2. Row is unified with   `end_of_file` upon reaching the
  391%   end of the input.
  392
  393csv_read_row(Stream, Row, _Record) :-
  394    at_end_of_stream(Stream),
  395    !,
  396    Row = end_of_file.
  397csv_read_row(Stream, Row, Record) :-
  398    read_lines_to_codes(Stream, Codes),
  399    phrase(row(Row0, Record), Codes),
  400    !,
  401    Row = Row0.
  402
  403read_lines_to_codes(Stream, Codes) :-
  404    read_line_to_codes(Stream, Codes, Tail),
  405    (   Tail == []
  406    ->  true
  407    ;   Tail = []
  408    ;   read_lines_to_codes(Stream, Tail)
  409    ).
  410
  411
  412%!  csv_options(-Compiled, +Options) is det.
  413%
  414%   Compiled is the  compiled  representation   of  the  CSV  processing
  415%   options as they may be passed into   csv//2,  etc. This predicate is
  416%   used in combination with csv_read_row/3 to avoid repeated processing
  417%   of the options.
  418
  419csv_options(Compiled, Options) :-
  420    make_csv_options(Options, Compiled, _Ignored).
  421
  422
  423                /*******************************
  424                *             OUTPUT           *
  425                *******************************/
  426
  427%!  csv_write_file(+File, +Data) is det.
  428%!  csv_write_file(+File, +Data, +Options) is det.
  429%
  430%   Write a list of Prolog terms to a CSV file.  Options are given
  431%   to csv//2.  Remaining options are given to open/4.  The  default
  432%   separator depends on the file name   extension and is =|\t|= for
  433%   =|.tsv|= files and =|,|= otherwise.
  434
  435csv_write_file(File, Data) :-
  436    csv_write_file(File, Data, []).
  437
  438csv_write_file(File, Data, Options) :-
  439    must_be(list, Data),
  440    default_separator(File, Options, Options1),
  441    make_csv_options(Options1, Record, RestOptions),
  442    phrase(emit_csv(Data, Record), String),
  443    setup_call_cleanup(
  444        open(File, write, Out, RestOptions),
  445        format(Out, '~s', [String]),
  446        close(Out)).
  447
  448
  449emit_csv([], _) --> [].
  450emit_csv([H|T], Options) -->
  451    emit_row(H, Options), "\r\n",   % RFC 4180 demands \r\n
  452    emit_csv(T, Options).
  453
  454emit_row(Row, Options) -->
  455    { Row =.. [_|Fields] },
  456    emit_fields(Fields, Options).
  457
  458emit_fields([H|T], Options) -->
  459    emit_field(H, Options),
  460    (   { T == [] }
  461        ->  []
  462        ;   { csv_options_separator(Options, Sep) },
  463        [Sep],
  464        emit_fields(T, Options)
  465    ).
  466
  467emit_field(H, Options) -->
  468    { (   atom(H)
  469      ->  atom_codes(H, Codes)
  470      ;   string(H)
  471      ->  string_codes(H, Codes)
  472      )
  473    },
  474    !,
  475    (   { needs_quotes(H, Options) }
  476    ->  "\"", emit_string(Codes), "\""
  477    ;   emit_codes(Codes)
  478    ).
  479emit_field([], _) -->
  480    !,
  481    { atom_codes('[]', Codes) },
  482    emit_codes(Codes).
  483emit_field(H, _) -->
  484    { number_codes(H,Codes) },
  485    emit_codes(Codes).
  486
  487needs_quotes(Atom, _) :-
  488    sub_atom(Atom, _, _, _, '"'),
  489    !.
  490needs_quotes(Atom, _) :-
  491    sub_atom(Atom, _, _, _, '\n'),
  492    !.
  493needs_quotes(Atom, _) :-
  494    sub_atom(Atom, _, _, _, '\r'),
  495    !.
  496needs_quotes(Atom, Options) :-
  497    csv_options_separator(Options, Sep),
  498    char_code(Char, Sep),
  499    sub_atom(Atom, _, _, _, Char),
  500    !.
  501
  502emit_string([]) --> "".
  503emit_string([0'"|T]) --> !, "\"\"", emit_string(T).
  504emit_string([H|T]) --> [H], emit_string(T).
  505
  506emit_codes([]) --> "".
  507emit_codes([0'"|T]) --> !, "\"\"", emit_codes(T).
  508emit_codes([H|T]) --> [H], emit_codes(T).
  509
  510
  511%%     csv_write_stream(+Stream, +Data, +Options) is det.
  512%
  513%      Write  the  rows  in  Data  to    Stream.   This  is  similar  to
  514%      csv_write_file/3,  but  can  deal  with  data  that  is  produced
  515%      incrementally. The example  below  saves   all  answers  from the
  516%      predicate data/3 to File.
  517%
  518%        ==
  519%        save_data(File) :-
  520%           setup_call_cleanup(
  521%               open(File, write, Out),
  522%               forall(data(C1,C2,C3),
  523%                      csv_write_stream(Out, [row(C1,C2,C3)], [])),
  524%               close(Out)),
  525%        ==
  526
  527csv_write_stream(Stream, Data, Options) :-
  528    must_be(list, Data),
  529    make_csv_options(Options, Record, _),
  530    phrase(emit_csv(Data, Record), String),
  531    format(Stream, '~s', [String])