#!/usr/bin/perl

=head1 NAME

parsepica - parse PICA+ data and print summary information

=cut

use strict;

our $VERSION = '0.22';

# include PICA packages
use PICA::Record;
use PICA::Field;
use PICA::Parser;
use PICA::Writer;
use PICA::Source;

# include other packages
use Getopt::Long;
use Pod::Usage;

my ($outfilename, $logfile, $inputlistfile, $verbose);
my ($quiet, $help, $man, $select, $xmlmode, $loosemode, $countmode, $pretty);
my ($unapimode);
my ($limit, $offset);
my %fieldstat_a; # all
my %fieldstat_e; # exist?
my %fieldstat_r; # number of records

# TODO read settings from config file (for instance SRU)

GetOptions(
    # TODO: add config file
    "output:s" => \$outfilename,   # print valid records to a file
    "log:s" => \$logfile,          # print messages to a file
    "files:s" => \$inputlistfile,  # read names of input files from a file
    "pretty" => \$pretty,
    "quiet" => \$quiet,            # suppress status messages
    "help|?" => \$help,            # show help message
    "man" => \$man,                # full documentation
    "select=s" => \$select,        # select a special field/subfield
    "count" => \$countmode,
    "limit=i" => \$limit,
    "unapi" => \$unapimode,
    "verbose" => \$verbose,
    #"loose" => \$loosemode,        # loose parsing
    "xml" => \$xmlmode
) or pod2usage(2);
pod2usage(1) if $help;
pod2usage(-verbose => 2) if $man;

# Logfile
if ($logfile and $logfile ne "-") {
    open LOG, ">$logfile" 
        or die("Error opening $logfile\n");
} elsif( not $quiet and ($logfile eq "-" or $verbose) ) {
    *LOG = *STDOUT;
} else {
    open LOG, '>/dev/null';

}

# Output writer
$outfilename = "-" unless defined $outfilename;
print LOG "Output to $outfilename\n" if $outfilename ne "-";
my @p = ($outfilename ne "-" ? $outfilename : \*STDOUT);
push @p, ('format' => 'XML') if $xmlmode;
my $writer = PICA::Writer->new( @p, pretty => $pretty );


# init input file list if specified
if ($inputlistfile) { 
    if ($inputlistfile eq "-") {
        *INFILES = *STDIN;
    } else {
        print LOG "Reading input files from $inputlistfile\n";
        open INFILES, $inputlistfile or die("Error opening $inputlistfile");
    }
}

# handlers
my $_field_handler = \&field_handler;
my $_record_handler = \&record_handler;

# select mode
my $field_regex;
my $subfield_select = "";

if ($select) {
    my ($tag, $subfield) = ("","");

    if ( $select =~ /^...+[\$_]/ ) {
        ($tag, $subfield) = split(/[\$_]/,$select);
    } else {
        $tag = $select;
    }

    $field_regex = qr/^$tag$/;
    $subfield_select = $subfield if $subfield ne "";

    $_field_handler = \&select_field_handler;
    undef $_record_handler;

    if ($subfield_select ne "") {
        print LOG "Selecting subfield: $select\n";
    } else {
        print LOG "Selecting field: $select\n";
    }
}

my $remote_counter = 0;

my %options;
$options{Limit} = $limit if defined $limit;
$options{Proceed} = 1;

# init parser
my $parser = PICA::Parser->new(
    Field => $_field_handler,
    Record => $_record_handler,
    %options
);

# parse files given at the command line, in the input file list or STDIN
my $filename;
if (@ARGV > 0) {
    if ($inputlistfile) {
        print STDERR "You can only specify either an input file or a file list!\n";
        exit 0;
    }
    while (($filename = shift @ARGV)) {
        my ($baseurl, $z3950host);
        if ($filename =~ /^http:\/\//) { # SRU or unAPI (http://...)
            $baseurl = $filename;
        } elsif ($filename =~ /^[^\\:]+:\d+/) { # Z3950 (host:port[/db])
            $z3950host = $filename;
        }
        if ($baseurl or $z3950host) {
            my $query = shift @ARGV;
            if (!$query) {
                print STDERR "query missing!\n";
            } else {
                my $remote_parser;
                if ($baseurl && $query =~ /=/) {
                    print LOG "SRU query '$query' to $baseurl\n";
                    my $server = PICA::Source->new( SRU => $baseurl );
                    $remote_parser = $server->cqlQuery( $query,
                        # TODO: better pipe this to another parser (RecordParser)
                        Field => $_field_handler,
                        Record => $_record_handler
                    );
                } elsif ($baseurl) {
                    my $r;
                    if ($unapimode) {
                        print LOG "unAPI query '$query' from $baseurl\n";
                        my $source = PICA::Source->new( unAPI => $baseurl ); # TODO: document this
                        $r = $source->getPPN($query,"gvk"); # TODO: prefix is bad unAPI usage
                    } else {
                        print LOG "PSI get PPN '$query' from $baseurl\n";
                        my $source = PICA::Source->new( PSI => $baseurl ); # TODO: document this
                        $r = $source->getPPN($query);
                    }
                    $parser->parsedata( $r ) if $r;
                } else {
                    print LOG "Z3950 query '$query' to $z3950host\n";
                    my $server = PICA::Source->new( Z3950 => $z3950host );
                    $remote_parser = $server->z3950Query( $query,
                        # TODO: better pipe this to another parser (RecordParser)
                        Field => $_field_handler,
                        Record => $_record_handler
                    );
                }
                $remote_counter += $remote_parser->counter() if defined $remote_parser;
            }
        } else {
            print LOG "Reading $filename\n";
            $parser->parsefile($filename);
        }
    }
} elsif ($inputlistfile) {
    while(<INFILES>) {
        chomp;
        next if $_ eq "";
        $filename = $_;
        print LOG "Reading $filename\n";
        my ($record) = PICA::Parser->parsefile( $filename, Limit => 1)->records;
    }
} else {
    print LOG "Reading standard input\n";
    $parser->parsefile( \*STDIN ); 
}

# Finish
$writer->end();

# Print summary
# TODO: Input fields: ...
print LOG "Input records:\t" . ($parser->counter() + $remote_counter) .
      "\nOutput records:\t" . $writer->counter() .
      "\nOutput fields:\t" . $writer->fields() .
      "\n";

if ($countmode) {
    print "Frequency of tags in all records:\n";
    foreach my $tag (sort keys %fieldstat_a) {
        print "$tag\t" . $fieldstat_a{$tag} . "\t";
        print $fieldstat_r{$tag};
        print "\n";
    }
}


#### handler methods ####

# default field handler
sub field_handler {
    my $field = shift;

    if ($countmode) {
        my $tag = $field->tag;
        if (defined $fieldstat_a{$tag}) {
            $fieldstat_a{$tag}++;
        } else {
            $fieldstat_a{$tag} = 1;
        }
        $fieldstat_e{$tag} = 1;
    }

    return $field;
}

# flushing field handler
#sub flush_field_handler {
#    my $field = shift;
#    $writer->writefield( $field );
#}

# selecting field handler
sub select_field_handler {
    # TODO: Combine with count/default handler

    my $field = shift;
    return unless $field->tag() =~ $field_regex;

     if ($subfield_select ne "") {
        my @sf = $field->subfield( $subfield_select );
        # TODO: print subfield if output format is XML (?)
        print { $writer->{io} } join("\n",@sf) . "\n" if @sf;
    } else {
        $writer->write($field);
    }

    return undef;
}

# default record handler (TODO: directly use a PICA::Writer object)
sub record_handler {
    my $record = shift;
    $writer->write( $record ); 

    if ($countmode) {
        foreach my $tag (keys %fieldstat_e) {
            if (defined $fieldstat_r{$tag}) {
                $fieldstat_r{$tag}++;
            } else {
                $fieldstat_r{$tag} = 1;
            }
        }
        %fieldstat_e = ();
    }

    if ($verbose) {
        print LOG $parser->counter() ."\n" unless ($parser->counter() % 100);
    }
}


=head1 SYNOPSIS

parsepica [options] [file(s) or SRU-Server(s) and queries(s)..]

=head1 OPTIONS

 -help          brief help message
 -man           full documentation with examples
 -log FILE      print logging to a given file ('-': STDOUT, default)
 -input FILE    file with input files on each line ('-': STDIN)
 -output FILE   print all valid records to a given file ('-': STDOUT)
 -xml           output of records in XML
 -pretty        pretty output (useful for PICA XML)
 -quiet         supress logging
 -select FIELD  select a specific field or subfield (no XML output possible yet)

Not fully implemented yet:
 -sru SRU       fetch records via SRU. command line arguments are cql
                statements instead of files
 -z3950         fetch records via Z39.50

=head1 DESCRIPTION

This script demonstrates how to use the Perl PICA module. It can be used 
to check and count records. Input files can be specified as arguments or
from an input file list. Compressed files (C<.gz>) can directly be read.
If no input file is specified then input is read from STDIN.

Logging information is printed to STDOUT (unless quiet mode is set) or to 
a specified logfile. Read records can be written back to a given file or 
to STDOUT ('-') . Records that cannot be parseded produce error messages 
to STDERR.

Selecting fields with parsepica is around half as fast as using 
grep, but grep does not really parse and check for wellformedness.

=head1 EXAMPLES

=over 4

=item parsepica picadata -o checkedrecords

Read records from 'picadata' and print parseable records to 'checkedrecords'.

=item parsepica picadata -s 021A -o - -q

Select all fields '021A' from 'picadata' and write to STDOUT.

=item parsepica http://gso.gbv.de/sru/DB=2.1/ pica.isb=3-423-31039-1

Get records with ISBN 3-423-31039-1 via SRU.

=back

=head1 TODO

Error handling needs to be implemented to collect broken records.

Examples to implement:

parsepica -b errors picadata

Parse records in C<picadata>. The number of records will be reported.

parsepica -out checked -quiet picadata.gz

Parse records in C<picadata.gz>. Print records that are wellformed 
to C<checked> and the other records to C<errors>. Supress any messages. 

=head1 AUTHOR

Jakob Voss C<< jakob.voss@gbv.de >>
