/*  Part of SWI-Prolog

    Author:        Jan Wielemaker
    E-mail:        J.Wielemaker@vu.nl
    WWW:           http://www.swi-prolog.org
    Copyright (c)  2009-2017, VU University Amsterdam
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:

    1. Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.

    2. Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in
       the documentation and/or other materials provided with the
       distribution.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.
*/

:- module(csv,
          [ csv//1,                     % +Rows
            csv//2,                     % +Rows, +Options

            csv_read_file/2,            % +File, -Data
            csv_read_file/3,            % +File, -Data, +Options
            csv_read_file_row/3,        % +File, -Row, +Options
            csv_read_row/3,             % +Stream, -Row, +CompiledOptions
            csv_options/2,              % -Compiled, +Options

            csv_write_file/2,           % +File, +Data
            csv_write_file/3,           % +File, +Data, +Options
            csv_write_stream/3          % +Stream, +Data, +Options
          ]).
:- use_module(library(record)).
:- use_module(library(error)).
:- use_module(library(pure_input)).
:- use_module(library(debug)).
:- use_module(library(option)).

/** <module> Process CSV (Comma-Separated Values) data

This library parses and generates CSV data. CSV data is represented in
Prolog as a list of rows. Each row is a compound term, where all rows
have the same name and arity.
*/
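% Illustrative example of the row representation described above (the
% query and its data are hypothetical, not part of the library): parsing
% two comma-separated records yields a list of row/2 terms, with numeric
% fields converted by default.
%
%   ?- phrase(csv(Rows), `name,age\njan,27\n`).
%   Rows = [row(name, age), row(jan, 27)].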
:- predicate_options(csv//2, 2,
                     [ separator(nonneg),       % must be a character code
                       strip(boolean),
                       ignore_quotes(boolean),
                       convert(boolean),
                       case(oneof([down,preserve,up])),
                       functor(atom),
                       arity(-nonneg),          % actually ?nonneg
                       match_arity(boolean)
                     ]).
:- predicate_options(csv_read_file/3, 3,
                     [ pass_to(csv//2, 2),
                       pass_to(phrase_from_file/3, 3)
                     ]).
:- predicate_options(csv_read_file_row/3, 3,
                     [ pass_to(csv//2, 2),
                       pass_to(open/4, 4)
                     ]).
:- predicate_options(csv_write_file/3, 3,
                     [ pass_to(csv//2, 2),
                       pass_to(open/4, 4)
                     ]).
:- predicate_options(csv_write_stream/3, 3,
                     [ pass_to(csv//2, 2)
                     ]).


:- record
    csv_options(separator:integer=0',,
                strip:boolean=false,
                ignore_quotes:boolean=false,
                convert:boolean=true,
                case:oneof([down,preserve,up])=preserve,
                functor:atom=row,
                arity:integer,
                match_arity:boolean=true).
%!  csv_read_file(+File, -Rows) is det.
%!  csv_read_file(+File, -Rows, +Options) is det.
%
%   Read a CSV file into a list of rows.  Each row is a Prolog term
%   with the same arity.  Options is handed to csv//2; remaining
%   options are processed by phrase_from_file/3.  The default
%   separator depends on the file name extension and is a tab (\t)
%   for .tsv files and a comma otherwise.
%
%   Suppose we want to create a predicate table/6 from a CSV file
%   that we know contains 6 fields per record.  This can be done
%   using the code below.  Without the option arity(6), this would
%   generate a predicate table/N, where N is the number of fields
%   per record in the data.
%
%     ==
%     ?- csv_read_file(File, Rows, [functor(table), arity(6)]),
%        maplist(assert, Rows).
%     ==
csv_read_file(File, Rows) :-
    csv_read_file(File, Rows, []).

csv_read_file(File, Rows, Options) :-
    default_separator(File, Options, Options1),
    make_csv_options(Options1, Record, RestOptions),
    phrase_from_file(csv_roptions(Rows, Record), File, RestOptions).


default_separator(File, Options0, Options) :-
    (   option(separator(_), Options0)
    ->  Options = Options0
    ;   file_name_extension(_, Ext0, File),
        downcase_atom(Ext0, Ext),
        ext_separator(Ext, Sep)
    ->  Options = [separator(Sep)|Options0]
    ;   Options = Options0
    ).

ext_separator(csv, 0',).
ext_separator(tsv, 0'\t).
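% A minimal sketch of the table/6 pattern documented above, assuming a
% file whose records all have six fields.  load_table/1 is a
% hypothetical helper, not part of this library.
load_table(File) :-
    csv_read_file(File, Rows, [functor(table), arity(6)]),
    maplist(assertz, Rows).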
%!  csv(?Rows)// is det.
%!  csv(?Rows, +Options)// is det.
%
%   Prolog DCG to read and write CSV data.  Options:
%
%     * separator(+Code)
%     The field separator.  Must be a character code; the default is
%     the comma.  For example, separator(0';) parses a semicolon
%     separated file.
%
%     * ignore_quotes(+Boolean)
%     If true (default false), treat double quotes as a normal
%     character.
%
%     * strip(+Boolean)
%     If true (default false), strip leading and trailing blank
%     space.  RFC 4180 says that blank space is part of the data.
%
%     * convert(+Boolean)
%     If true (default), use name/2 on the field data.  This
%     translates the field into a number if possible.
%
%     * case(+Action)
%     If down, downcase atomic values; if up, upcase them; if
%     preserve (default), do not change the case.
%
%     * functor(+Atom)
%     Functor used for creating row terms.  Default is row.
%
%     * arity(?Arity)
%     Number of fields in each row.  Raises
%     domain_error(row_arity(Expected), Found) if a row is found
%     with a different arity.
%
%     * match_arity(+Boolean)
%     If false (default true), do not reject CSV files where lines
%     provide a varying number of fields (columns).  This can be a
%     work-around for processing some incorrect CSV files.
%
%   An illustrative query using these options is shown after the
%   parsing code below.

csv(Rows) -->
    csv(Rows, []).

csv(Rows, Options) -->
    { make_csv_options(Options, Record, _) },
    csv_roptions(Rows, Record).

csv_roptions(Rows, Record) -->
    { ground(Rows) },
    !,
    emit_csv(Rows, Record).
csv_roptions(Rows, Record) -->
    csv_data(Rows, Record).

csv_data([], _) -->
    eof,
    !.
csv_data([Row|More], Options) -->
    row(Row, Options),
    !,
    { debug(csv, 'Row: ~p', [Row]) },
    csv_data(More, Options).

eof([], []).

row(Row, Options) -->
    fields(Fields, Options),
    { csv_options_functor(Options, Functor),
      Row =.. [Functor|Fields],
      functor(Row, _, Arity),
      check_arity(Options, Arity)
    }.

check_arity(Options, Arity) :-
    csv_options_arity(Options, Arity),
    !.
check_arity(Options, _) :-
    csv_options_match_arity(Options, false),
    !.
check_arity(Options, Arity) :-
    csv_options_arity(Options, Expected),
    domain_error(row_arity(Expected), Arity).

fields([F|T], Options) -->
    field(F, Options),
    (   separator(Options)
    ->  fields(T, Options)
    ;   end_of_record
    ->  { T = [] }
    ).

field(Value, Options) -->
    "\"",
    { csv_options_ignore_quotes(Options, false) },
    !,
    string_codes(Codes),
    { make_value(Codes, Value, Options) }.
field(Value, Options) -->
    { csv_options_strip(Options, true) },
    !,
    stripped_field(Value, Options).
field(Value, Options) -->
    { csv_options_separator(Options, Sep) },
    field_codes(Codes, Sep),
    { make_value(Codes, Value, Options) }.


stripped_field(Value, Options) -->
    ws,
    (   "\"",
        { csv_options_ignore_quotes(Options, false) }
    ->  string_codes(Codes),
        ws
    ;   { csv_options_separator(Options, Sep) },
        field_codes(Codes0, Sep),
        { strip_trailing_ws(Codes0, Codes) }
    ),
    { make_value(Codes, Value, Options) }.

ws --> " ", !, ws.
ws --> "\t", !, ws.
ws --> "".

strip_trailing_ws(List, Stripped) :-
    append(Stripped, WS, List),
    all_ws(WS).

all_ws([]).
all_ws([32|T]) :- all_ws(T).
all_ws([9|T]) :- all_ws(T).
%!  string_codes(-Codes)//
%
%   Process the content of a double-quoted field in which an embedded
%   double quote is escaped by doubling it.  Consumes the terminating
%   double quote.

string_codes(List) -->
    [H],
    (   { H == 0'" }
    ->  (   "\""
        ->  { List = [H|T] },
            string_codes(T)
        ;   { List = [] }
        )
    ;   { List = [H|T] },
        string_codes(T)
    ).

%   field_codes//2 collects a raw field up to the separator or end of
%   record, leaving the terminator in the input (\r\n is normalised
%   to \n).

field_codes([], Sep), [Sep] --> [Sep], !.
field_codes([], _), "\n" --> "\r\n", !.
field_codes([], _), "\n" --> "\n", !.
field_codes([H|T], Sep) --> [H], !, field_codes(T, Sep).
field_codes([], _) --> [].              % unterminated last record
%!  make_value(+Codes, -Value, +Options) is det.
%
%   Convert a list of codes into a Prolog value, subject to the
%   convert and case options.

make_value(Codes, Value, Options) :-
    csv_options_convert(Options, Convert),
    csv_options_case(Options, Case),
    make_value(Convert, Case, Codes, Value).

make_value(true, preserve, Codes, Value) :-
    !,
    name(Value, Codes).
make_value(true, Case, Codes, Value) :-
    !,
    (   number_string(Value, Codes)
    ->  true
    ;   make_value(false, Case, Codes, Value)
    ).
make_value(false, preserve, Codes, Value) :-
    !,
    atom_codes(Value, Codes).
make_value(false, down, Codes, Value) :-
    !,
    string_codes(String, Codes),
    downcase_atom(String, Value).
make_value(false, up, Codes, Value) :-
    string_codes(String, Codes),
    upcase_atom(String, Value).

separator(Options) -->
    { csv_options_separator(Options, Sep) },
    [Sep].

end_of_record --> "\n".                 % Unix files
end_of_record --> "\r\n".               % DOS files
end_of_record --> "\r".                 % MacOS files
end_of_record --> eof.                  % unterminated last record
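% Illustrative query for csv//2 (a hypothetical example, not part of the
% library): parse a semicolon-separated record with stripping enabled,
% using the options documented above.
%
%   ?- phrase(csv(Rows, [separator(0';), strip(true)]), ` a ; b ;3\n`).
%   Rows = [row(a, b, 3)].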
%!  csv_read_file_row(+File, -Row, +Options) is nondet.
%
%   True when Row is a row in File.  Rows are returned one by one on
%   backtracking.  In addition to the options of csv_read_file/3,
%   this predicate processes the option:
%
%     * line(-Line)
%     Line is unified with the line number from which Row was read.
csv_read_file_row(File, Row, Options) :-
    default_separator(File, Options, Options1),
    make_csv_options(Options1, RecordOptions, Options2),
    select_option(line(Line), Options2, RestOptions, _),
    setup_call_cleanup(
        open(File, read, Stream, RestOptions),
        csv_read_stream_row(Stream, Row, Line, RecordOptions),
        close(Stream)).

csv_read_stream_row(Stream, Row, Line, Options) :-
    between(1, infinite, Line),
    (   csv_read_row(Stream, Row0, Options),
        Row0 \== end_of_file
    ->  Row = Row0
    ;   !,
        fail
    ).
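% A minimal sketch (not part of the library): print every row of a file
% together with the line number obtained through the line(Line) option
% described above.  print_rows_with_lines/1 is a hypothetical helper.
print_rows_with_lines(File) :-
    forall(csv_read_file_row(File, Row, [line(Line)]),
           format('~d: ~p~n', [Line, Row])).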
%!  csv_read_row(+Stream, -Row, +CompiledOptions) is det.
%
%   Read the next CSV record from Stream and unify the result with
%   Row.  CompiledOptions is created from the options of csv//2
%   using csv_options/2.  Row is unified with end_of_file upon
%   reaching the end of the input.

csv_read_row(Stream, Row, _Record) :-
    at_end_of_stream(Stream),
    !,
    Row = end_of_file.
csv_read_row(Stream, Row, Record) :-
    read_lines_to_codes(Stream, Codes),
    phrase(row(Row0, Record), Codes),
    !,
    Row = Row0.

read_lines_to_codes(Stream, Codes) :-
    read_line_to_codes(Stream, Codes, Tail),
    (   Tail == []
    ->  true
    ;   Tail = []
    ;   read_lines_to_codes(Stream, Tail)
    ).
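% A minimal sketch (not part of the library) of the intended use of
% csv_read_row/3: compile the options once with csv_options/2 (defined
% below) and read records until end_of_file.  process_csv_stream/1 and
% handle_row/1 are hypothetical helpers.
process_csv_stream(Stream) :-
    csv_options(Compiled, [functor(item)]),
    repeat,
    csv_read_row(Stream, Row, Compiled),
    (   Row == end_of_file
    ->  !
    ;   handle_row(Row),
        fail
    ).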
%!  csv_options(-Compiled, +Options) is det.
%
%   Compiled is the compiled (record) representation of the csv//2
%   options in Options, as required by csv_read_row/3.

csv_options(Compiled, Options) :-
    make_csv_options(Options, Compiled, _Ignored).


                 /*******************************
                 *            OUTPUT            *
                 *******************************/
%!  csv_write_file(+File, +Data) is det.
%!  csv_write_file(+File, +Data, +Options) is det.
%
%   Write a list of Prolog terms to a CSV file.  Options are given to
%   csv//2; remaining options are given to open/4.  The default
%   separator depends on the file name extension and is a tab (\t)
%   for .tsv files and a comma otherwise.

csv_write_file(File, Data) :-
    csv_write_file(File, Data, []).

csv_write_file(File, Data, Options) :-
    must_be(list, Data),
    default_separator(File, Options, Options1),
    make_csv_options(Options1, Record, RestOptions),
    phrase(emit_csv(Data, Record), String),
    setup_call_cleanup(
        open(File, write, Out, RestOptions),
        format(Out, '~s', [String]),
        close(Out)).


emit_csv([], _) --> [].
emit_csv([H|T], Options) -->
    emit_row(H, Options), "\r\n",       % RFC 4180 demands \r\n
    emit_csv(T, Options).

emit_row(Row, Options) -->
    { Row =.. [_|Fields] },
    emit_fields(Fields, Options).

emit_fields([H|T], Options) -->
    emit_field(H, Options),
    (   { T == [] }
    ->  []
    ;   { csv_options_separator(Options, Sep) },
        [Sep],
        emit_fields(T, Options)
    ).

emit_field(H, Options) -->
    { (   atom(H)
      ->  atom_codes(H, Codes)
      ;   string(H)
      ->  string_codes(H, Codes)
      )
    },
    !,
    (   { needs_quotes(H, Options) }
    ->  "\"", emit_string(Codes), "\""
    ;   emit_codes(Codes)
    ).
emit_field([], _) -->
    !,
    { atom_codes('[]', Codes) },
    emit_codes(Codes).
emit_field(H, _) -->
    { number_codes(H, Codes) },
    emit_codes(Codes).

needs_quotes(Atom, _) :-
    sub_atom(Atom, _, _, _, '"'),
    !.
needs_quotes(Atom, _) :-
    sub_atom(Atom, _, _, _, '\n'),
    !.
needs_quotes(Atom, _) :-
    sub_atom(Atom, _, _, _, '\r'),
    !.
needs_quotes(Atom, Options) :-
    csv_options_separator(Options, Sep),
    char_code(Char, Sep),
    sub_atom(Atom, _, _, _, Char),
    !.

emit_string([]) --> "".
emit_string([0'"|T]) --> !, "\"\"", emit_string(T).
emit_string([H|T]) --> [H], emit_string(T).

emit_codes([]) --> "".
emit_codes([0'"|T]) --> !, "\"\"", emit_codes(T).
emit_codes([H|T]) --> [H], emit_codes(T).
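% A minimal sketch (not part of the library): write rows to a file with
% a hypothetical .tsv name, so the default-separator rule above selects
% the tab character.  save_scores/0 and its data are invented for
% illustration.
save_scores :-
    csv_write_file('scores.tsv',
                   [ row(name, score),
                     row(alice, 42),
                     row(bob, 37)
                   ], []).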
%!  csv_write_stream(+Stream, +Data, +Options) is det.
%
%   Write the rows in Data to Stream.  Unlike csv_write_file/3, this
%   can be used to emit rows incrementally.  The example below saves
%   all answers of the predicate data/3 to File.
%
%     ==
%     save_data(File) :-
%         setup_call_cleanup(
%             open(File, write, Out),
%             forall(data(C1,C2,C3),
%                    csv_write_stream(Out, [row(C1,C2,C3)], [])),
%             close(Out)).
%     ==
csv_write_stream(Stream, Data, Options) :-
    must_be(list, Data),
    make_csv_options(Options, Record, _),
    phrase(emit_csv(Data, Record), String),
    format(Stream, '~s', [String]).