#!/usr/bin/perl -w

use strict;
use Time::Local;
use Getopt::Long;
use Data::Dumper;

my $version = "version 2009-06-06";
my $copyright = "copyright (c) 2009 by Bob Ewart bob-ewart\@earthlink.net";
my $license = "licensed under GPL v2";
my $help = 0;
my $show_version = 0;
my $outfname;
my $infname;
my $blankfname = 'DBTblanks.csv';
my $invfname = 'DBTinvalid.csv';
my $debug = 5;

GetOptions("help!"      => \$help,
           "version!"   => \$show_version,
           "blanks=s"   => \$blankfname,
           "invalid=s"  => \$invfname,
           );

if ($show_version) { die "$version\n$copyright\n$license\n"; }
my $narg = @ARGV;
if ($narg) {
  $infname = $ARGV[0];
} else {
  $help = 1;
}

if ($help) {
  die "\nDBTextCheck infile\nScan DBText database files for field names counts and lengths\n\n".
      "Options:\n".
      "   --help      This text\n".
      "   --version   The version $version\n".
      "   --blanks=   File name for blank records (default $blankfname)\n".
      "   --invalid=  File name for records missing required fields (default $invfname)\n".
      "\n";
}

open(INFILE, '<',$infname) or die "Can't open $infname for input\n$!\n";

$/ = chr(03);           # Record separator as selected for DBText file export
my $sep = chr(04);         # Field separator

my (@headers, @fields, %field_no, @stats);
my ($field, $i, $n, $record, $nfields);
my ($class, $type, $atype);
my (%record_type,%actual_type);
my $type_field = 0;
my %class_type;
my $class_field = 0;
my $books_out = 0;
my $due_field =  0;
my $perm_field = 0;
my $perm_count = 0;
my $blanks =  0;
my $invalid = 0;
my $records = 0;
my @check_blank = ();
my @must_have = ();
my @error_headers = ();


$record = <INFILE>;
chomp($record);
@headers = split /$sep/,$record;
my $nheaders = @headers;

for ($i = 0; $i < @headers; $i++) {
  $field_no{$headers[$i]} = $i;              # Gives us the ability to select field by name
  $stats[$i] = {                             # collect statistics on each field
                 count     => 0,   # number of non-blank occurances
                 maxlength => 0,   # 
                 crlf      => 0,   #  cr/lf count for this field
                 subfields => 0,   # sub fields
               }; 
}

#=================================================================
# Modify the following fields for your records
#=================================================================

my @catalog_blank = ('Title','Subtitle','Author','Corporate Author');
my @catalog_required = ();
my @catalog_error = ('ID','Class','Bar Code','Record Type','Title','Author','Record Created');

my @borrower_blank = ('First Name','Last Name','Company');
my @borrower_required = ('City','State','Zip');
my @borrower_error = ('Borrower Number','Last Name','First Name','City','State','Zip');

my @loan_blank = ('Item Loaned');
my @loan_required = ('Borrower','Item Loaned');
my @loan_error = ('Loan Number','Borrower','Item Loaned','Loan Due Date');

my %convert_type = ('BOOK'     => 'BOOK',
                    'DVD'      => 'DVD',
                    'EROTIC'   => 'ARCHIVE',
                    'F-A'      => 'ARCHIVE',
                    'F-IA'     => 'ARCHIVE',
                    'KIDS'     => 'BOOK',
                    'PAMPHLET' => 'PAMPHLET',
                    'PAMPNLET' => 'PAMPHLET',
                    'REC'      => 'AUDIO',
                    'VID'      => 'VIDEO',
                    'VIDEOTAPE'=> 'VIDEO',
                    'ZPR'      => 'BOOK',
                    'XSERIAL'  => 'OFFSITE',
                    'AUDIOBOOK'=> 'AUDIO',
                    'CD'       => 'CD',
                    );
#========================================================
# End of area to be modified
#========================================================



if ($headers[0] eq 'ID') {                    # Catalog Record
  # fields to be checked for blank records
  foreach $field (@catalog_blank) {
    push @check_blank, $field_no{$field};
  }
  #required fields
  foreach $field (@catalog_required) {
    push @must_have, $field_no{$field};
  }
  $type_field = $field_no{'Record Type'};
  $class_field = $field_no{'Class'};
  @error_headers = @catalog_error;
    
} elsif ($headers[0] eq 'Borrower Number') {   # Borrower Record
  # fields to be checked for blank records
  foreach $field (@borrower_blank) {
    push @check_blank, $field_no{$field};
  }
  # required fields
  foreach $field (@borrower_required) {
    push @must_have, $field_no{$field};
  }
  @error_headers = @borrower_error;                      # Headers for invalid records
    
  
} elsif ($headers[0] eq 'Loan Number') {       # Loan Record
  # fields to be checked for blank records
  foreach $field (@loan_blank) {
    push @check_blank, $field_no{$field};
  }
  # required fields
  foreach $field (@loan_required) {
    push @must_have, $field_no{$field};
  }
  @error_headers = @loan_error;                      # Headers for invalid records
  $due_field = $field_no{'Loan Returned Date'};
  $perm_field = $field_no{'Permanent Loan'};
  
} else {
  warn "'$headers[0]' is not recognized\n";
}


if ($blankfname) {
  open(BLANKS, '>',$blankfname) or die "Can't open blank file as $blankfname for output\n$!\n";
  print BLANKS join(', ',@headers)."\n";
}
if ($invfname) {
  open(INV, '>',$invfname) or die "Can't open invalid file as $invfname for output\n$!\n";
  print INV join(',',@error_headers)."\n";
}

while ($record = <INFILE>) {
  chomp($record);
  @fields = split /$sep/,$record;
  $records++;
  $nfields = @fields;
  if ($nfields != $nheaders) {
    if ($nfields > $nheaders) { 
      $nfields = $nheaders;       # don't process extra fields
    }
  }
  for ($i = 0; $i < $nfields; $i++) {
    if ($fields[$i] !~ m/^\s*$/) {      #non-blank field
      $stats[$i]->{'count'}++;
      $n = 0;
      $n++ while $fields[$i] =~ s/\r\n/\|/g;
      $stats[$i]->{'crlf'} += $n;
      $fields[$i] =~ s/\|\s*\|/\|/g;     # get rid of blank sub-fields
      $fields[$i] =~ s/^\s*\"\s*//;         # get rid of enclosing double quotes 
      $fields[$i] =~ s/\s*\"\s*$//;
      $fields[$i] =~ s/^\s*\'\s*//;         # get rid of enclosing single quotes
      $fields[$i] =~ s/\s*\'\s*$//;
      $fields[$i] =~ s/^\s*\|//;         # leading or
      $fields[$i] =~ s/\|\s*$//;         # trailing |
      if (length($fields[$i]) > $stats[$i]->{'maxlength'}) {
        $stats[$i]->{'maxlength'} = length($fields[$i]);
      }
      $n = 0;
      $n++ while $fields[$i] =~ m/\|/g;
      $stats[$i]->{'subfields'} += $n;
    }
  }
  $n = 0;
  foreach $field (@check_blank) {       # check for blank records
    if (($field < $nfields) && ($fields[$field] !~ m/^\s*$/)) { $n++;}
  }
  if ($n eq 0) {
    $blanks++;
    if ($blankfname) {
      print BLANKS '"'.join('", "',@fields)."\"\n";
    }
  } else {                            # check for must have fields
    $n = 0;
    foreach $field (@must_have) {
      if (($field < $nfields) && ($fields[$field] =~ m/^\s*$/)) { $n++;}
    }
    if ($n) {
      print_error();
    } else {
      if ($type_field) {             # summarize catalog types
#        if ($debug && ($fields[$field_no{'Place'}] =~ m/\|/)) {
        if ($debug && ($records % 1000) == $debug) {
          print_record();
          $debug--;
        }
        $type = $fields[$type_field];
        $class = $fields[$class_field];
        if (!$type) {
          # Record Type is missing
          if (!$class) {
            # both class and type are missing
            $class = 'missing';
            print_error();
          } else {
            # Can we determine record type from the class
            if ($class =~ m/([a-z\-]+)/i) {
              $class = uc($1);
              if (length($class) < 3) {  # must be a LoC code
                $type = 'BOOK';
                $class = '';
              } else {
                if (exists $convert_type{$class}) {
                  $type = $convert_type{$class};
                  $class = '';
                } else {
                  print_error();
                }
              }
            }
          }
        } else {
          $type =~ m/([a-z]+)/i;
          $type = uc($1);
          $class = '';
        }
        if (!$type) {
          $type = 'missing';
        }
        if (exists $convert_type{$type}) {
          $type = $convert_type{$type};
        }
        $record_type{$type}++;
        if ($class) {
          $class_type{$class}++;
        }
      if ($fields[$type_field]) {
        $atype = $fields[$type_field];
      } else {
        $atype = 'missing';
      }
      if (exists $actual_type{$type}->{$atype}) {
          $actual_type{$type}->{$atype}++;
        } else {
          $actual_type{$type}->{$atype}= 1;
#          if ($debug--) {
#            print "\n$type -> $fields[$type_field]\n";
#            print Dumper(%actual_type);
#          } 
        }
      }
      if ($due_field) {              # count books checked out
        if (!$fields[$due_field] || ($fields[$due_field] =~ m/^\s*$/)) {
          $books_out++;
        }
        if ($fields[$perm_field] && ($fields[$perm_field] =~ m/y/i)) {
          $perm_count++;              # permanent loan count
        }
      }
    }
  }
#  ($records < 3) || last;
}

#========================================
# End of main loop
#----------------------------------------
# Print Statistics
#========================================

print "\nField Present Length CrLf xField Description\n";

for ($i = 0; $i < $nheaders; $i++) {
  printf( "%5d %6.1f%% ",$i,(100.0 * $stats[$i]->{'count'})/$records); 
  printf("%5d %5d %6d %s\n",$stats[$i]->{'maxlength'},$stats[$i]->{'crlf'},
           $stats[$i]->{'subfields'},$headers[$i]);
}

if ($type_field) {
  print "\nCount Record Type\n";
  foreach $field (sort keys %record_type) {
    printf("%5d %s\n",$record_type{$field},$field);
    foreach $type (sort keys %{$actual_type{$field}}) {
      printf("      %5d %s\n",$actual_type{$field}->{$type},$type);
    }
  }
  print "\nCount Class for missing record type\n";
  foreach $field (sort keys %class_type) {
    if (length($field) > 2) {
      printf("%5d %s\n",$class_type{$field},$field);
    }
  }
}
if ($due_field) {
  printf("\n%5d items still out\n",$books_out);
  printf("%5d items on permanent loan\n\n",$perm_count);
}

printf("\n%5d records\n%5d blank records\n%5d invalid records\n\n",$records,$blanks,$invalid);

#==================================
# End of main line
#----------------------------------
# Subroutines
#==================================

sub print_error {
  my (@error_fields,$field_name, $field_value);
  @error_fields = ();
  foreach $field_name (@error_headers) {
    if (exists $field_no{$field_name}) {
      $field_value = $fields[$field_no{$field_name}];
    } else {
      $field_value = 'Missing-'.$field_name;
      
      ($records < 1000) && print_record();
    }
    if (!$field_value) { $field_value = 'missing';}
    if ($field_value =~ m/,/) {
      $field_value = '"'.$field_value.'"';
    }
    push @error_fields, $field_value;
  }
  print INV join(',',@error_fields)."\n";
  $invalid++;
}

sub print_record {
  print "\nRecord $records\n";
  for ($i = 0; $i < @headers; $i++) {
        print $headers[$i];
    if ($fields[$i]) {
      print "=>$fields[$i].\n";
    } else {
      print ".\n";
    }
  }
}

