#!/usr/bin/perl -w
#
# stats -- statistics calculation program for parp
#
# Copyright (c) 2000 Adam Spiers <adam@spiers.net>. All rights
# reserved. This program is free software; you can redistribute it and/or
# modify it under the same terms as Perl itself.
#
# $Id: stats,v 1.14 2001/09/13 15:32:12 adam Exp $
#

use strict;

use Time::Local;

use lib "$ENV{HOME}/.parp";
use MyFilter qw(%CONFIG);

my $log_file;

my $program = $0;
$program =~ s!.*/!!;
die "Usage: $program [ log file ]\n" if @ARGV > 1;

if (@ARGV) {
  $log_file = $ARGV[0];
}
else {
  die "Couldn't find log file in config file.\n" unless $CONFIG{log_file};
  $log_file = $CONFIG{log_file};
}

my %totals   = ();
my %dates    = ();
my %messages = ();
my %tests    = ();

read_log_file($log_file);
output_stats($log_file);

exit 0;

##############################################################################

sub read_log_file {
  my ($log_file) = @_;

  open(LOGS, $log_file) or
    die "Couldn't open log file $log_file: $!\n";

  my $id = '';
  my %date = ();
  my $class = 'unknown';

  while (<LOGS>) {
    if (! $id) {
      next unless /^Parp-ID: (.*)/;
      $id = $1;
      my ($time, $md5) = $id =~ m,(\d+)/(.*),;
      @date{qw/sec min hour mday mon year wday yday isdst/} = localtime($time);
      $dates{earliest} = $time
        if ! $dates{earliest} or $time < $dates{earliest};
      $dates{latest}   = $time
        if ! $dates{latest}   or $time > $dates{latest};
      $totals{year}{$date{year}+1900}++;
      $totals{month}{$date{mon}}++;
      $totals{mday}{$date{mday}}++;
      $totals{wday}{$date{wday}}++;       # counts mails by wday
      my $day_time = timelocal(0, 0, 12, $date{mday}, $date{mon}, $date{year});
      $totals{wday2}{$date{wday}}{$day_time}++; # counts number of each wday
      $totals{date}{$day_time}++;
      next;
    }

    if (/^--------------------/) {
      $messages{$id}{class} ||= $class;
      $totals{class}{$class}++;
      $id = '';
      %date = ();
      $class = 'unknown';
      next;
    }

    if (/^(Accepted|REJECTED): (.*?)(?: \((.*)\))?\s*$/) {
      my ($action, $test, $details) = ($1, $2, $3);
      $totals{test}{$test}++;
      $totals{'tested_' . (($action eq 'Accepted') ? 'ok' : 'spam')}{$test}++;
      $tests{$test}{$details || '__undef__' }++;
      $class = 'not_spam' if $action eq 'Accepted';
      next;
    }

    if (/^Delivered to (.*)\s*$/) {
      my $delivered_to = $1;
      $totals{delivered_to}{$delivered_to}++;
      $class = 'not_spam';
      $class = 'spam' if $delivered_to =~ /junk|spam/i;
      next;
    }

    if (/^(Would complain|Complained)/) {
      $messages{$id}{complaint}++;
      $totals{complaints}++;
      next;
    }

    if (/^Reclassification: (.*)/) {
      my $reclassification = $1;
      if ($reclassification eq 'was incorrectly identified as spam') {
        $messages{$id}{real_class} = 'not_spam';
      }
      elsif ($reclassification eq 'was incorrectly identified as bona-fide') {
        $messages{$id}{real_class} = 'spam';
      }
      next;
    }
  }

  close(LOGS);

  $totals{days} = keys %{ $totals{date} };
}

##

sub output_stats {
  my ($log_file) = @_;

  print "Statistics for parp log file $log_file\n";
  print '=' x 78, "\n\n";

  my $left_col = "%17s";
  my $template = "$left_col %s\n";
  printf "$left_col %d bytes\n", 'Size of log file:', (stat $log_file)[7];

  unless (scalar keys %messages) {
    print "No e-mails found.\n";
    return;
  }

  printf $template, 'Earliest entry:', scalar localtime($dates{earliest})
    if $dates{earliest};
  printf $template, 'Latest entry:', scalar localtime($dates{latest})
    if $dates{latest};
  printf $template, 'Total mails:', scalar keys %messages;
  printf $template, 'Number of days:', $totals{days};
  print "\n";

  output_class_stats();
  output_test_stats();
  output_date_stats();
}

sub output_class_stats {
  $totals{false}{negatives}       = 0;
  $totals{false}{positives}       = 0;
  $totals{class}{spam}          ||= 0;
  $totals{class}{not_spam}      ||= 0;
  $totals{real_class}{spam}       = 0;
  $totals{real_class}{not_spam}   = 0;
  $totals{complaints}           ||= 0;
  $totals{false_complaints}     ||= 0;
  $totals{missing_complaints}   ||= 0;

  foreach my $id (keys %messages) {
    my $class = $messages{$id}{class};
    if ($messages{$id}{real_class}) {
      my $real_class = $messages{$id}{real_class};
      $totals{real_class}{$real_class}++;
      if ($class eq 'not_spam' and $real_class eq 'spam') {
        $totals{missing_complaints}++ unless $messages{$id}{complaint};
        $totals{false}{negatives}++; # not a disaster
      }
      elsif ($class eq 'spam' and $real_class eq 'not_spam') {
        $totals{false_complaints}++ if $messages{$id}{complaint};
        $totals{false}{positives}++; # oops!
      }
    }
    else {
      $totals{real_class}{$class}++;
    }
  }

  my $wrong_class   = $totals{false}{positives} + $totals{false}{negatives};
  my $total_mails   = keys %messages;
  my $correct_class = $total_mails - $wrong_class;
  my $accuracy = sprintf "%.4f%%", $correct_class / $total_mails * 100;
  my $every = $wrong_class ?
                (sprintf "%.0f", $total_mails / $wrong_class)
              : '\infinity';

  header('Classification success', 0);
  print <<EOF;
Accuracy: $accuracy (1 failure in every $every)

  [Spam]

    Mails classified as spam:           $totals{class}{spam}
    Actual number of spam mails:        $totals{real_class}{spam}
    Number of false negatives:          $totals{false}{negatives}

  [Bona-fide]

    Mails classified as bona-fide:      $totals{class}{not_spam}
    Actual number of bona-fide mails:   $totals{real_class}{not_spam}
    Number of false positives:          $totals{false}{positives}

  [Complaints]

    Number of complaints made:          $totals{complaints}
    Number of complaints made in error: $totals{false_complaints}
    Number of complaints not made:      $totals{missing_complaints}
EOF

  header('Delivery destinations by frequency');
  
  foreach my $dest (sort { $totals{delivered_to}{$b}
                             <=>
                           $totals{delivered_to}{$a} }
                         keys %{$totals{delivered_to}}) {
    printf "%5d  %s\n", $totals{delivered_to}{$dest}, $dest;
  }
                    
}

sub output_test_stats {
  header('Tests by frequency');

  foreach my $test (sort { $totals{test}{$b} <=> $totals{test}{$a} }
                         keys %{$totals{test}} ) {
    printf "%5d  %s\n", $totals{test}{$test}, $test;
  }

  print "\nSpam:\n";

  foreach my $test (sort { $totals{tested_spam}{$b} <=> $totals{tested_spam}{$a} }
                         keys %{$totals{tested_spam}} ) {
    printf "%5d  %s\n", $totals{tested_spam}{$test}, $test;
  }

  print "\nOK:\n";

  foreach my $test (sort { $totals{tested_ok}{$b} <=> $totals{tested_ok}{$a} }
                         keys %{$totals{tested_ok}} ) {
    printf "%5d  %s\n", $totals{tested_ok}{$test}, $test;
  }

  header('Lexical breakdown of tests');
  print "All:\n";

  foreach my $test (sort keys %{$totals{test}} ) {
    printf "%5d  %s\n", $totals{test}{$test}, $test;
    foreach my $details (sort { $tests{$test}{$b} <=> $tests{$test}{$a} }
                              keys %{$tests{$test}}) {
      printf "      %5d  %s\n", $tests{$test}{$details}, $details;
    }
    print "\n";
  }

  print "\nSpam:\n";

  foreach my $test (sort keys %{$totals{tested_spam}} ) {
    printf "%5d  %s\n", $totals{tested_spam}{$test}, $test;
    foreach my $details (sort { $tests{$test}{$b} <=> $tests{$test}{$a} }
                              keys %{$tests{$test}}) {
      printf "      %5d  %s\n", $tests{$test}{$details}, $details;
    }
    print "\n";
  }

  print "\nOK:\n";

  foreach my $test (sort keys %{$totals{tested_ok}} ) {
    printf "%5d  %s\n", $totals{tested_ok}{$test}, $test;
    foreach my $details (sort { $tests{$test}{$b} <=> $tests{$test}{$a} }
                              keys %{$tests{$test}}) {
      printf "      %5d  %s\n", $tests{$test}{$details}, $details;
    }
    print "\n";
  }
}

sub by_time_prefix {
  my ($c, $d) = ($a, $b);
  $c =~ s,!.*,,;
  $d =~ s,!.*,,;
  return $c <=> $d;
}

sub output_date_stats {
  header('Mails by day of week, most frequent first');
  
  my @days_of_week =
    qw/Sunday Monday Tuesday Wednesday Thursday Friday Saturday/;
  
  foreach my $wday (sort { $totals{wday}{$b} <=> $totals{wday}{$a} }
                         keys %{$totals{wday}}) {
    my $wday_freq = keys %{ $totals{wday2}{$wday} };
    my $average = $totals{wday}{$wday} / $wday_freq;
    printf "%5d in %3d %-10s (average %d)\n",
             $totals{wday}{$wday},
             $wday_freq,
             $days_of_week[$wday] .
               ($wday_freq == 1 ? '' : 's'),
             $average;
  }

  my $average = keys(%messages) / $totals{days};
  printf "\nAverage mails per day: %d\n", $average;

  header('Mails by date, most frequent first');
  
  foreach my $day_time (sort { $totals{date}{$b} <=> $totals{date}{$a} }
                             keys %{$totals{date}})
  {
    printf "%5d  %s\n", $totals{date}{$day_time}, scalar(localtime $day_time);
  }
                    
  header('Mails by date, earliest first');
  
  foreach my $day_time (sort by_time_prefix keys %{$totals{date}}) {
    printf "%5d  %s\n", $totals{date}{$day_time}, scalar(localtime $day_time);
  }
                    
  header('Mails by day of month, most frequent first');
  
  foreach my $mday (sort { $totals{mday}{$b} <=> $totals{mday}{$a} }
                         keys %{$totals{mday}})
  {
    printf "%5d  %d\n", $totals{mday}{$mday}, $mday;
  }
                    
  header('Mails by day of month, earliest first');
  
  foreach my $mday (sort { $a <=> $b } keys %{$totals{mday}}) {
    printf "%5d  %d\n", $totals{mday}{$mday}, $mday;
  }
                    
  header('Mails by month');
  
  foreach my $month (sort { $a <=> $b } keys %{$totals{month}}) {
    printf "%5d  %s\n",
             $totals{month}{$month},
             (qw/Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec/)[$month];
  }

  header('Mails by year');
  
  foreach my $year (sort { $b <=> $a } keys %{$totals{year}}) {
    printf "%5d  %d\n", $totals{year}{$year}, $year;
  }
}

sub header {
  my ($header, $pre_lines) = @_;

  $pre_lines = 2 unless defined $pre_lines;
  for my $i (1 .. $pre_lines) {
    print "\f" if $i == $pre_lines;
    print "\n";
  }
  print "$header:\n", ('-' x length $header), "\n\n";
}
