package Mail::Filterable;

##############################################################################
#
# Routines for parsing the e-mail being filtered and calculating various
# bits of data which will be used in the filtering process.
#

use strict;
use warnings;

use Data::Dumper;
use Digest::MD5 qw(md5_base64);
use Mail::Address;
use Mail::Field;
use Time::Local;

use Parp::Config qw(config);
use Parp::Folders qw(append_to_folder folder_substs);
use Parp::Friends qw(make_friend have_friends is_friend);
use Parp::IdCache;
use Parp::Options qw(opt);
use Parp::Utils qw(vprint log_to_file check_file_dir month2i error fatal);
use Parp::Blacklist qw(blacklist_lookup);

sub new {
  my ($this, $mail, $props_hashref) = @_;
  my $class = ref($this) || $this;
  my $self = bless {}, $class;
  my $m = $self; # syntactic sugar

  my $header = $mail->head();
  $$m{content_type} = $header->get('Content-Type') || '';
  $$m{mail}   = $mail;
  $$m{header} = $header;
  $$m{body}   = $self->_get_body($mail->body());
  $$m{body_scalar} = join '', map { s/^>From/From/; $_ } @{$$m{body}};

  # It's easier if all the headers we deal with are single lines.
  # In particular, don't fold `From ' header, as we want that to match
  # a date regexp below.
  $header->unfold();

  # envelope from
  $header->mail_from('KEEP');
  if ($mail->can('fromLine')) {
    # got a Mail::Box::Mbox::Message
    ($$m{mail_from} = $mail->fromLine()) =~ s/^From //;
  }
  else {
    $$m{mail_from}  = $header->get('From ') 
                   || $header->get('Mail-From')      || '';
  }
  $$m{parp_id}      = $header->get('X-Parp-Id')      || '';
  $$m{from}         = $header->get('From')           || '';
  $$m{to}           = $header->get('To')             || '';
  $$m{cc}           = $header->get('Cc')             || '';
  $$m{subject}      = $header->get('Subject')        || '';
  $$m{return_path}  = $header->get('Return-Path')    || '';
  $$m{reply_to}     = $header->get('Reply-To')       || '';
  $$m{list}         = $header->get('X-Mailing-List') || '';
  $$m{sender}       = $header->get('Sender')         
                   || $header->get('X-Sender')       || '';
  $$m{in_reply_to}  = $header->get('In-Reply-To')    || '';
  $$m{references}   = $header->get('References')     || '';
  $$m{id}           = $header->get('Message-ID')     || '';
  $$m{date}         = $header->get('Date')           || '';
  $$m{status}       = $header->get('Status')         || '';
  $$m{a_to}         = $header->get('Apparently-To')  || '';
  $$m{precedence}   = $header->get('Precedence')     || '';
  $$m{organisation} = $header->get('Organisation')  
                   || $header->get('Organization')   || '';
  $$m{mailer}       = $header->get('User-Agent') 
                   || $header->get('X-Mailer')       || '';

  $self->_check_resent_headers();
  $self->_chomp_props();
  $self->_check_envelope();
  $self->_extract_addrs();
  
  $$m{recvds_array}    = [ $header->get('Received') ];
  $$m{recvds}          = join '', @{$$m{recvds_array}};

  $self->_check_auth_sender();

  # Remove all previously existing parp headers except X-Parp-Id
  my @parp_headers = grep /^X-Parp-(?!Id)/, $$m{header}->tags();
  foreach my $parp_header (@parp_headers) {
    $$m{header}->delete($parp_header);
  }

  # Add a header for the process id to try to chase down obscure bugs.
  $$m{header}->add('X-Parp-pid', $$);
#  system("/bin/date >> ~/mail/.parp.pstree");
#  system("pstree >> ~/mail/.parp.pstree");

  # Calculate a unique id which parp can always refer to.  We
  # calculate the MD5 digest of enough of the whole mail to ensure a
  # unique id, but without any bits which might change in some way
  # during the e-mail's life-span, so as to guarantee that during any
  # subsequent reclassification of the e-mail as a false
  # positive/negative (in the spam detection sense) this unique id
  # will match with the original, so that the statistics calculation
  # program will work.
  
  my $immutable_header =
    join '', map "$_: $$m{$_}\n", (qw/mail_from from to cc subject
                                     return_path reply_to in_reply_to
                                     references id date mailer recvds/);
  
  my $immutable = $immutable_header .
                  "\n" .
                  $$m{body_scalar};
  $immutable =~ s/\n+$/\n/;

  my $parp_id = $$m{env_from_time} . "/" . md5_base64($immutable);

  # This one was a PAIN to get right.  I don't feel safe enough yet to
  # remove the debugging.
  my $immutables_dir = "$ENV{HOME}/mail/.immutables";
  if (-d $immutables_dir) {
    my $id_file = $parp_id;
    $id_file =~ s!/!_!g;
    $id_file = "$immutables_dir/$id_file";
    
    while (-e $id_file) {
      # Generate a unique suffix
      $id_file =~ s/(?:\.(\d+))?$/"." . (($1 || 0) + 1)/e;
    }
    
    if (open(FOO, ">$id_file")) {
      print FOO $immutable;
      close(FOO);
    }
    else {
      error("Couldn't open $id_file for writing: $!");
    }
  }
  
  if ($$m{parp_id}) {
    # This e-mail has already been run through parp, so it already has
    # an X-Parp-Id header.

    if ($$m{parp_id} ne $parp_id) {
      # Better check that the id we've just calculated is the same,
      # otherwise our calculation algorithm is in trouble.
      error("Message already had a parp id of: $$m{parp_id}\n" .
            "       but recalculation yielded: $parp_id",
            #"\%m:\n", Dumper(\%m),
           );
    }
  }
  else {
    # This e-mail hasn't been touched by parp before, so stamp it with
    # a parp id.
    $$m{header}->add('X-Parp-Id', $parp_id);
    $$m{parp_id} = $parp_id;
  }

  $$m{backup}   = 1;     # back up by default
  $$m{complain} = 1;     # allow complaining by default

  return $self;
}

sub _get_body {
  my ($m, $body) = @_;

  return $body unless $m->{content_type} =~ m!^multipart/.*boundary=(.*)\n!s;

  # Deal with MIME multipart messages without using a very slow
  # parser from CPAN ...
  my $boundary = $1;
  $boundary =~ s/^"(.*)"$/$1/;
  $boundary = quotemeta $boundary;
  log_to_file qq{Message is multipart; splitting on boundary "$1".\n};
  
  my @parts = split /--$boundary(?:--)?\n?/m, join('', @$body);

  log_to_file "Deleting non-text parts ... \n";
  my @body_lines = ();
  foreach my $part (@parts) {
    my @lines = split /(?<=\n)/, $part;
    my $part_mail = new Mail::Internet(\@lines);
    next unless @lines;
    my $content_type = $part_mail->get('Content-Type');
    if ($content_type) {
      chomp $content_type;
      log_to_file "Content-Type: $content_type";
      push @{ $m->{content_types} }, $content_type;

      if ($content_type !~ m!^text/\b!) {
        log_to_file "; skipping ...\n";
        next;
      } else {
        log_to_file "\n";
      }
    } else {
      $content_type = '_unspecified_';
      log_to_file "Warning: Content-Type was unspecified; assuming plain text.\n";
    }

    push @body_lines, @{ $part_mail->body() };
  }

  return \@body_lines;
}

sub _check_resent_headers {
  my ($self) = @_;

  # From RFC822:
  #
  # --------- 8< --------- 8< --------- 8< --------- 8< --------- 8< ---------
  #    4.2.  FORWARDING
  #  
  #         Some systems permit mail recipients to  forward  a  message,
  #    retaining  the original headers, by adding some new fields.  This
  #    standard supports such a service, through the "Resent-" prefix to
  #    field names.
  #  
  #         Whenever the string "Resent-" begins a field name, the field
  #    has  the  same  semantics as a field whose name does not have the
  #    prefix.  However, the message is assumed to have  been  forwarded
  #    by  an original recipient who attached the "Resent-" field.  This
  #    new field is treated as being more recent  than  the  equivalent,
  #    original  field.   For  example, the "Resent-From", indicates the
  #    person that forwarded the message, whereas the "From" field indi-
  #    cates the original author.
  #  
  #         Use of such precedence  information  depends  upon  partici-
  #    pants'  communication needs.  For example, this standard does not
  #    dictate when a "Resent-From:" address should receive replies,  in
  #    lieu of sending them to the "From:" address.
  #  
  #    Note:  In general, the "Resent-" fields should be treated as con-
  #           taining  a  set  of information that is independent of the
  #           set of original fields.  Information for  one  set  should
  #           not  automatically be taken from the other.  The interpre-
  #           tation of multiple "Resent-" fields, of the same type,  is
  #           undefined.
  # --------- 8< --------- 8< --------- 8< --------- 8< --------- 8< ---------
  #
  # So we only take values from Resent- headers when we can't get them
  # any other way but we really would prefer to have them.

  my %resent_headers = ( id => 'Message-ID' );
  foreach my $header_key (keys %resent_headers) {
    my $header_name = $resent_headers{$header_key};
    $self->{$header_key} ||= $self->{header}->get("Resent-$header_name") || '';
  }
}

sub _chomp_props {
  my ($self) = @_;
  
  foreach my $prop (qw/mail_from parp_id from to cc subject
                       return_path reply_to list
                       in_reply_to references id date status a_to mailer/)
  {
    chomp $self->{$prop} if $self->{$prop};
  }
}

sub _check_envelope {
  my ($self) = @_;

  if (! $self->{mail_from}) {
    error(<<QMAIL);
Envelope From header missing.  If you are using qmail as your MTA,
make sure your .qmail setup passes the mail through the preline filter
before being passed to parp, e.g.

  | preline /path/to/parp -dr

Otherwise, parp may not be compatible with your MTA.  Does it deliver
in mbox format?
QMAIL
  }

  if ($self->{mail_from} !~
      /(.*?)\s*(\w{3}) (\w{3}) ([\d ]\d) (\d\d):(\d\d):(\d\d) (\d{4})$/) {
    fatal(6, "Envelope From header had weird date format:\n$self->{mail_from}",
          "\$m:\n", Dumper($self));
  }
  
  $self->{env_from} = $1;
  my ($dow, $month, $mday, $hour, $min, $sec, $year) =
     ($2, $3, $4, $5, $6, $7, $8);

  $self->{env_from_time} =
    timelocal($sec, $min, $hour, $mday, month2i($month), $year);
}

sub _extract_addrs {
  my ($m) = @_;

  # FIXME: This is inefficient.
  @{$$m{ftc}}            = _extract_addrs_to_array(@$m{qw/env_from from
                                                          to cc list sender/});
  $$m{from_addr}         = _extract_addr_to_scalar($$m{from});
  $$m{env_from_addr}     = _extract_addr_to_scalar($$m{env_from});
  @{$$m{from_addrs}}     = _extract_addrs_to_array(@$m{qw/env_from from/});
  @{$$m{all_from_addrs}} = _extract_addrs_to_array(@$m{qw/env_from from
                                                          reply_to return_path
                                                          sender id/});
}

sub _check_auth_sender {
  my ($self) = @_;

  my @comments        = $self->{header}->get('Comments');
  $self->{comments}        = \@comments;

  $self->{auth_sender}     = '';
  foreach my $comment (@comments) {
    if ($comment =~ /^Authenticated sender is (.*)/i) {
      $self->{auth_sender} = $1;
      last;
    }
  }
}

sub _extract_addrs_to_array {
  my (@lines) = @_;

  my @addrs = ();
  foreach my $line (@lines) {
    my @new_addrs = Mail::Address->parse($line);
    push @addrs, map { $_->address() } @new_addrs;
  }

  return @addrs;
}

sub _extract_addr_to_scalar {
  my ($line) = @_;

  my @addrs = ();
  my @new_addrs = Mail::Address->parse($line);
  push @addrs, map { $_->address() } @new_addrs;

  error("header passed to _extract_addr_to_scalar had " .
        "more than one address\n",
        map("  $_\n", @addrs),
        "line: $line\n",
       )
    if @addrs > 1;

  return @addrs ? $addrs[0] : undef;
}


##############################################################################
#
# Routines for performing tests on the e-mail being filtered, and
# categorizing it accordingly.
#

sub matches {
  my $m = shift;
  my ($category, $re, $debug, $multi) = @_;

  my @matches = ();
  foreach my $addr (@{ $m->{$category} }) {
    print "Testing $addr =~ /$re/\n" if $debug;
    my $matches = $addr =~ $re;
    if ($matches) {
      print "  -- matched!\n" if $debug;
      if ($multi) {
        push @matches, $addr;
      }
      else {
        my @substrs = ();
        for my $i (1 .. @+) { 
          # Perl should have a primitive for this.
          push @substrs, substr($addr, $-[$i], $+[$i]) if defined $-[$i];
        }
        return ($matches, @substrs);
      }
    }
  }

  return $multi ? @matches : ();
}

sub ftc_matches { # TODO
  my $m = shift;
  my @matches = $m->matches('ftc', @_);
  return $matches[0];
}

sub extract_friends {
  my $m = shift;
  my ($folder) = @_;

  # Count which addresses in which headers are mine.
  my %addrs = $m->_classify_addresses();

  my @maybe_new_friends = ();

  if ($addrs{from}{me}  == 1 &&
      $addrs{to}{total} == 1 &&
      $addrs{cc}{total} == 0)
  {
    push @maybe_new_friends, { addr => $addrs{to}{addrs}[0],
                               header => 'to' };
    log_to_file "Found friend in `To:' header.\n";
  }
  elsif ( # $addrs{to}{me} + $addrs{cc}{me} >= 1 && # could be on a list
         $addrs{from}{not_me} == 1)
  {
    push @maybe_new_friends, { addr => $addrs{from}{addrs}[0],
                               header => 'From' };
    push @maybe_new_friends, { addr => $addrs{reply_to}{addrs}[0],
                               header => 'Reply-To' }
      if $addrs{reply_to}{total} == 1;
    log_to_file "Found friend in `From' and `Reply-To:' headers.\n";
  }

  my $folder_name = $folder ? qq.`$folder'. : 'unknown';
  return $m->_make_friends($folder_name, @maybe_new_friends);
}


sub _make_friends {
  my $m = shift;
  my ($folder_name, @new_friends) = @_;

  my $added = 0;
  foreach my $new_friend (@new_friends) {
    my $addr = $new_friend->{addr};
    vprint "Adding `$addr' to friends database ... \n";
    if (my $reason = is_friend($addr)) {
      vprint "$addr is already a friend ($reason)\n";
      next;
    }
    vprint "new\n";
    my $source = "friend extracted from `$new_friend->{header}' " .
                 "header of message";
    if ($m->{parp_id}) {
      $source .= " parp id $m->{parp_id}";
    }
    elsif ($m->{id}) {
      $source .= " id `$m->{id}'";
    }
    elsif ($m->{date}) {
      $source .= " dated $m->{date}";
    }
    $source .= " in $folder_name folder";
    make_friend($addr, $source);
    $added++;
  }
  
  return $added ? q[EXTRACTED_FRIEND] : q[DIDN'T_EXTRACT_FRIEND];
}

sub _classify_addresses {
  my $m = shift;
  
  my %addrs = (
#              env_from    => { descr => 'envelope From' },
               from        => { descr => 'From'          },
               to          => { descr => 'to'            },
               cc          => { descr => 'cc'            },
               reply_to    => { descr => 'Reply-To'      },
#              return_path => { descr => 'Return-Path'   },
              );

  foreach my $addr_type (keys %addrs) {
    $addrs{$addr_type}{$_} ||= 0 foreach qw/me not_me total/;
    next unless $m->{$addr_type};
    my @addrs = Mail::Address->parse($m->{$addr_type});
    foreach my $parsed (@addrs) {
      my $paddr = $parsed->address();
      if ($paddr =~ config->me() || $paddr =~ config->old_me()) {
        $addrs{$addr_type}{me}++;
      } else {
        $addrs{$addr_type}{not_me}++;
      }
      $addrs{$addr_type}{total}++;
      push @{$addrs{$addr_type}{addrs}}, $paddr;
    }
  }

# for my $type (qw/from to cc reply_to/) {
#   vprint "type $type: ";
#   for my $count (qw/me not_me total/) {
#     vprint "[$count $addrs{$type}{$count}]";
#   }
#   vprint "\n";
# }

  return %addrs;
}

sub is_duplicate {
  my $m = shift;

  return 1 if Parp::IdCache::is_duplicate($m->{id});

  Parp::IdCache::add_id($m->{id});
  log_to_file "Added id to duplicates cache.\n";
  return 0;
}

sub was_to_old_addresses {
  my $m = shift;

  my $found = 0;

  if ($m->{to} =~ config->old_me()) {
    log_to_file "*** Old address found:\n  ", $m->{to}, "\n";
    $found++;
  }

  if ($m->{cc} =~ config->old_me()) {
    log_to_file "*** Old address found:\n  ", $m->{cc}, "\n";
    $found++;
  }

  return $found;
}

sub is_passworded {
  my $m = shift;
  my $password = config->password();
  if ($m->{subject} =~ $password or
      ($m->{header}->get(config->password_header()) || '') =~ $password) {
    $m->accept_mail('contains good password');
    make_friend($m->{from_addr}, 'gave password');
    return 1;
  }

  return 0;
}

sub is_from_good_domain {
  my $m = shift;

  if ($m->{from}     =~ config->good_domains() &&
      ($m->{env_from} =~ config->good_domains() ||
#       $m->{id}       =~ config->good_domains() ||
       $m->{sender}   =~ config->good_domains()))
  {
    my $good_domain = $1;
    $m->accept_mail('good domain', $good_domain);
    return 1;
  }

  return 0;
}

sub has_good_headers {
  my $m = shift;

  # Could cross-check In-Reply-To: with good domains, but
  # no spammers seem to be setting this header yet, which
  # makes it an even more powerful test.
  if ($m->{in_reply_to}) {
    $m->accept_mail('had In-Reply-To: header');
    return 1;
  }

  if ($m->{references} =~ config->good_domains()) {
    $m->accept_mail('References: had good domain', $1);
    return 1;
  }

  if ($m->{subject} =~ config->subject_buzzwords()) {
    $m->accept_mail('subject had buzzword', $1);
    return 1;
  }

  if ($m->{mailer} =~ /(mutt)/i) {
    $m->accept_mail('good X-Mailer header', $1);
    return 1;
  }

  return 0;
}

sub is_from_good_person {
  my $m = shift;

  if (have_friends()) {
    foreach my $addr (@$m{qw/from_addr env_from_addr/}) {
      if (my $reason = is_friend($addr)) {
        $m->accept_mail('from friend', "`$addr' -- $reason");
        return 1;
      }
    }
  }

  return 0;
}

sub has_spam_headers {
  my $m = shift;

  # Many thanks to Mark-Jason Dominus and to the authors of junkfilter
  # and the NAGS filter for some of the ideas contained herein.

  my $octet_RE = '([12]?\d\d|\d\d|\d)';
  my $ipv4_RE  = ("$octet_RE\\." x 3) . $octet_RE;
  my $foo_RE   = qr![\w.%\#\$+-/]+\*?!;
  if ($m->{id} !~ m/^<
                      ($foo_RE|"$foo_RE")
                      \@
                      (
                       [\w-]+ (\. [\w-]+){0,6}         |
                       \[ $ipv4_RE \]
                      )
                     >/x) {
    $m->reject_junk_mail('invalid Message-ID: header', "`$m->{id}'");
    return 1;
  }

  if (my @m = $m->matches('ftc', config->decoys())) {
    $m->{complain} = 0; # don't let them wise up to me subscribing to
                        # stuff using a dud address
    $m->reject_junk_mail('not sent to a proper address', $m[1] || undef);
    return 1;
  }

  foreach my $bad_header (qw/PMFLAGS Advertisement X-Advertisement X-Shock/) {
    if ($m->{header}->get($bad_header)) {
      $m->reject_junk_mail('found bad header', $bad_header);
      return 1;
    }
  }

  my $uidl = $m->{header}->get('X-UIDL') || '';
  chomp $uidl;
  if ($uidl and $uidl !~ /^([0-9a-f]{32}|.{20})$/i) {
    $m->reject_junk_mail('invalid X-UIDL: header', "`$uidl'");
    return 1;
  }

  if ($m->{status} =~ /MC/i) {
    $m->reject_junk_mail('MaxAnnon! mailer');
    return 1;
  }

  if (($m->{header}->get('X-Distribution') || '') =~ /mass/i) {
    $m->reject_junk_mail('bulk mail sent with Pegasus');
    return 1;
  }

  if ($m->{from} =~ /^(<(_?\@_)?>)$/) {
    $m->reject_junk_mail("bad From: header", "`$1'");
    return 1;
  }

  if ($m->{return_path} =~ /^(<(_?\@_)?>)$/) {
    $m->reject_junk_mail("bad Return-Path: header", "contained `$1'");
    return 1;
  }

  if ($m->{from} eq '') {
    $m->reject_junk_mail('From: header is blank or missing');
    return 1;
  }

  if (($m->{subject} =~ tr/\x80-\xff//) > 3) {
    $m->reject_junk_mail('Subject: header had too many 8-bit characters');
    return 1;
  }

  if ($m->{date} =~ m![^\w:,()+/ \t-]!) {
    $m->reject_junk_mail('bad Date: header', "`$m->{date}'");
    return 1;
  }

  if ($m->{recvds} =~ /(-0600 \(EST\)|-0[57]00 \(EDT\))/) {
    $m->reject_junk_mail('bad Received: header date', "`$1'");
    return 1;
  }

  if ($m->{mailer} =~ config->bad_words()) {
    $m->reject_junk_mail("bad X-Mailer: header", "contained `$1'");
    return 1;
  }

  if ($m->{mailer} =~ /[0-9a-f]{10}/i) {
    $m->reject_junk_mail("bad X-Mailer: header", "contained nonsense hex");
    return 1;
  }

  if ($m->{recvds} =~ config->bad_words()) {
    $m->reject_junk_mail("bad Received: header", "contained `$1'");
    return 1;
  }

  if ($m->{organisation} =~ config->bad_words()) {
    $m->reject_junk_mail("bad organisation header", "contained `$1'");
    return 1;
  }

  if ($m->{precedence} =~ /(junk)/i) {
    # Precedence: bulk sometimes set by list software :-(
    $m->reject_junk_mail("bad precedence header", "contained `$1'");
    return 1;
  }

  if (@{$m->{ftc}} > config->max_recipients()) {
    $m->reject_junk_mail('too_many_recipients');
    return 1;
  }

  if ($m->{subject} =~ /\s{8}[0-9]+\s*$/) { # 8 spaces seems to be about right
    $m->reject_junk_mail('Subject: header contained trailing spam id');
    return 1;
  }

  if (config->like_me()) {
    my @recipients_like_me = $m->matches('ftc', config->like_me(), 0, 1);
    if (@recipients_like_me > config->max_like_me()) {
      $m->reject_junk_mail('spam sent to a big alphabetical list');
      return 1;
    }
  }

  if ($m->{from_addr} =~ /(\@{2,})/) {
    $m->reject_junk_mail('bad From: address', "contained `$1'");
    return 1;
  }

  return 1 if $m->has_spam_received_headers();
  
  return 0;
}

sub has_spam_received_headers {
  my $m = shift;
  $m->parse_received_headers();
  while (my ($recvd, $tree) = each %{ $m->{recvd_parse_trees} }) {
    return 1 if $m->received_header_is_spam($recvd, $tree);
  }
}

sub received_header_is_spam {
  my ($m, $recvd, $tree) = @_;

  my $HELO        = $tree->{from}{HELO}   || '';
  my $from_domain = $tree->{from}{domain} || '';

  if ($HELO =~ /tmpstr/i) {
    $m->reject_junk_mail('bad HELO', $HELO);
    return 1;
  }

# Some bona-fide MTAs give a single sub-domain as the domain.  Bah.
#   if ($HELO                  &&
#       $HELO !~ /\./          &&
#       $HELO ne 'localhost'   &&
#       $from_domain !~ /^$HELO/)
#   {
#     $m->reject_junk_mail('Received header had invalid from domain',
#                          $HELO);
#     return 1;
#   }

  if ($HELO =~ /hotmail\.com/ && $from_domain !~ /hotmail\.com$/) {
    my $whole = $tree->{from}{whole};
    chomp $whole;
    $m->reject_junk_mail('Received header faked as hotmail', $whole);
    return 1;
  }

  return 0;
}

sub has_suspicious_headers {
  my $m = shift;

  if ($m->{to} eq '' and $m->{cc} eq '') {
    $m->reject_junk_mail('To: and Cc: headers both blank or missing');
    return 1;
  }

  if ($m->{to} eq '') {
    $m->reject_junk_mail('To: header blank or missing');
    return 1;
  }

  if ($m->{to} =~ config->bad_to()) {
    $m->reject_junk_mail("bad To: header", "contained `$1'");
    return 1;
  }

  if ($m->{subject} =~ config->bad_subjects()) {
    $m->reject_junk_mail("bad Subject: header", "contained `$1'");
    return 1;
  }

  if (($m->{subject} =~ tr/!/!/) >= 5 ||
      $m->{subject} =~ /!!!!/) {
    $m->reject_junk_mail("Subject: header contained too many exclamation marks");
    return 1;
  }

  if ((my @words = $m->{subject} =~ /\b[A-Z]+\b/g) >= 6) {
    $m->reject_junk_mail('Subject: header had too many all-caps words');
    return 1;
  }

# This one is a bit extreme ...
#  if ($m->{subject} eq '') {
#    $m->reject_junk_mail('Subject: header is blank or missing');
#    return 1;
#  }

# This one is a bit extreme too ...
#  if ($m->{from} =~ /^(\d+)\@/ ||
#      $m->{from} =~ /^(\d+)\@/)
#  {
#    $m->reject_junk_mail('username is all digits', "`$1'");
#    return 1;
#  }

  return 0;
}

sub has_spam_from_addresses {
  my $m = shift;

  if (my @m = $m->matches('all_from_addrs', config->bad_from())) {
    $m->reject_junk_mail('bad from address', "contained `$m[1]'");
    return 1;
  }

  return 0;
}

sub has_spam_domains_anywhere {
  my $m = shift;

  if (my @m = $m->matches('all_from_addrs', config->bad_origins())) {
    $m->reject_junk_mail('bad from/return address', "`$m[1]'");
    return 1;
  }

  if ($m->{recvds} =~ config->bad_origins()) {
    $m->reject_junk_mail('bad Received: address', "`$1'");
    return 1;
  }

  $m->parse_received_headers();

  return 1 if opt('do_blacklist') && $m->has_blacklisted_IPs();

  return 0;
}

sub has_blacklisted_IPs {
  my $m = shift;
  
  my $debug = 0;
  my %ips = ();

  foreach my $recv (@{$m->{recvds_array}}) {
    # Avoid various false positives
    $recv =~ s/JetMail \d\.\d\.\d\.\d\b//g;
      
    my @ips = $recv =~ m@(?<!/)\b(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\b(?!\.)@g;
    @ips = grep { ! exists $ips{$_} } @ips;
    vprint "blacklist check on $recv ...\n" if $debug && @ips;
    foreach my $ip (@ips) {
      $ips{$ip}++;
      vprint "  Checking IP $ip ... " if $debug;
      my $rbl = blacklist_lookup($ip);
      if ($rbl) {
        my $reason = $rbl eq '1' ? $ip : $rbl;
#       if ($reason =~ /(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})/) {
#         $reason = "http://ordb.org/lookup/?host=$1";
#       }
        vprint ''. ($rbl || 'blacklisted') . "\n" if $debug;
        my @reject = ('Blacklisted');
        push @reject, $reason if $reason;
        $m->reject_junk_mail(@reject);
        return 1;
      } else {
        vprint "not found\n" if $debug;
      }
    }
  }

  return 0;
}

sub parse_received_headers {
  my $m = shift;

  return if $m->{recvd_parses_done};

  my $failed_parses_output = '';

  foreach my $recv (@{$m->{recvds_array}}) {
    $recv =~ s/\s*\n\s*/ /gm;

    my $obj = Mail::Field->new('Received', $recv);
    $obj->debug(5);

    if (! $obj->parsed_ok()) {
      # Output follows in order ...

      # First, preamble before parser errors
      $failed_parses_output .= <<EOF;
--
Error parsing Received: `$recv'

EOF

      # Then, a reminder of the message details
      $failed_parses_output .= <<EOF;
From: $m->{from}
To: $m->{to}
EOF
      $failed_parses_output .= "Cc: $m->{cc}\n" if $m->{cc};

      $failed_parses_output .= <<EOF;
Subject: $m->{subject}
Message-ID: $m->{id}

EOF

      # Finally, the incomplete parse tree
      $failed_parses_output .= Dumper($obj->parse_tree()) . "\n";

      $m->{recvd_parses_failed}++;

      $failed_parses_output .= $obj->diagnostics();
    }

    $m->{recvd_parse_trees}{$recv} = $obj->parse_tree();
#    vprint Dumper($obj->parse_tree()), "\n";
  }

  $m->{recvd_parses_out} = $failed_parses_output;
  $m->{recvd_parses_done} = 1;
}

sub for_me {
  my $m = shift;
  if ($m->{to} =~ config->me() or $m->{cc} =~ config->me()) {
    return 1;
  }
  $m->reject_junk_mail('not addressed to me');
  return 0;
}

sub has_spam_content {
  my $m = shift;

  return 1 if $m->has_bad_content_type();
  return 1 if $m->too_many_forwards();

  # Strip blank and quoted lines to get original text.
  my @original = grep ! /^\s*$|^> /, @{$m->{body}};
  $m->{_original_body_text} = \@original;

  my $first_few = $m->_get_body_head(4);
  return 1 if $m->has_spam_intro($first_few);

  my $last_few = $m->_get_body_tail(12);
  return 1 if $m->has_spam_ending($last_few);

  if ($last_few =~ /group.mail/i) {
    $m->reject_junk_mail('body suggested that a group mailer was used');
    return 1;
  }

  my $original = join '', @original;

  if ($original =~
        /this\s+(e?.?mail|message)\s+is\s+(being\s+)?sent\s+in\s+compliance/i) {
    $m->reject_junk_mail('body pretended that spam is legal');
    return 1;
  }

  if ($original =~ config->very_bad_words()) {
    $m->reject_junk_mail('body contained a very bad phrase', "`$1'");
    return 1;
  }

  if ($original =~ /\b ( \$\d{1,3}(,\d{3})+ | \${4}'s ) \b/x) {
    $m->reject_junk_mail('body mentioned dollars', "`$1'");
    return 1;
  }

  if ($original =~
        qr/(Hi!\s+How\s+are\s+you.+I\s+send\s+you\s+this\s+file\s+in\s+order\s+to\s+have\s+your\s+advice)/s) {
    $m->reject_junk_mail('body looked like virus', "`$1'");
    return 1;
  }

  if ($original =~ qr/(
                       # American phone numbers
                       \b (800|900|877|888) ([-\ ])
                       (
                         (?-i: [A-Z]{2,} (\2 [A-Z]+)* )          |
                         \d{3} \2 \d{4} \b
                       )                                         |
                       \b \(\d{3}\) \  \d{3}-\d{4} \b
                      )/) {
    $m->reject_junk_mail('body contained American phone #', "`$1'");
    return 1;
  }

  my $qbw = config->quite_bad_words();
  my @matches = ($original =~ /$qbw/g);
  my %uniques = map { lc $_ => $_ } @matches;
  log_to_file "Quite bad words found in body: ",
              scalar(@matches), " (", scalar(keys %uniques), " unique)\n"
    if @matches;

  if (@matches > config->max_quite_bad_words() &&
      scalar(keys %uniques) > config->max_unique_quite_bad_words()) {
    $m->reject_junk_mail('body contained too many bad words',
                         join ', ', map { "`$_'" } values %uniques);
    return 1;
  }

  return 0;
}

sub has_bad_content_type {
  my ($m) = @_;
  foreach my $type ($m->{content_type}, @{ $m->{content_types} || [] }) {
    if ($type =~ config->bad_content_type()) {
      $m->reject_junk_mail('bad Content-Type', $type);
      return 1;
    }
  }
  return 0;
}

sub _get_body_head {
  my ($m, $how_many) = @_;

  my @original = @{ $m->{_original_body_text} };
  
  my ($start, $end) = (0, $how_many);
  $end = $#original if $#original < $how_many;
  return join '', @original[$start .. $end];
}

sub _get_body_tail {
  my ($m, $how_many) = @_;

  my @original = @{ $m->{_original_body_text} };
  
  my ($start, $end) = (-$how_many, -1);
  if (@original < $how_many) {
    ($start, $end) = (0, $#original);
  }
  return join '', @original[$start .. $end];
}

sub too_many_forwards {
  my ($m) = @_;
  
  my $max = config->max_forwards();
  my @matches = ($m->{body_scalar} =~ /^\s*(>\s*){$max,}/mg);
#  log_to_file "Lines exceeding max_forwards: ", scalar(@matches), "\n";
  if (@matches > config->max_forwards_lines()) {
    $m->reject_junk_mail("forwarded more than "
                         . config->max_forwards()
                         . " times");
    return 1;
  }
  return 0;
}

sub has_spam_intro {
  my ($m, $first_few) = @_;

  my $me = config->me();
  if ($first_few =~ /^\s*
                      (Dear\ (
                        friend \b       |
                        .* surfer       |
                        $me
                      ))
                    /imx)
  {
    $m->reject_junk_mail('Suspicious method of address', "`$1'");
    return 1;
  }
  return 0;
}

sub has_spam_ending {
  my ($m, $last_few) = @_;

#   warn "remov: [$last_few] : " . ($last_few =~ /\bremoved?\b/i);
#   warn "2: " . $last_few =~ /respond|notify|reply|send|forward|click|software|
#                     mailto|type|return/ix;
#   warn "3: " . $last_few =~ /subjec?t|process|automatically/i;
  
  if ($last_few =~ /\bremoved?\b/i        &&
      $last_few =~ /respond|notify|reply|send|forward|click|software|
                    mailto|type|return|please/ix &&
      $last_few =~ /subjec?t|header|process|automatically/i) {
    $m->reject_junk_mail('body confessed it was junk');
    return 1;
  }
  return 0;
}


##############################################################################
#
# Routines providing actions to taken on the e-mail being filtered.
#

sub ditch_mail {
  my $m = shift;
  log_to_file "Delivered to /dev/null",
              @_ ? " (@_)" : '',
              "\n";
}

sub deliver_to_inbox {
  my $m = shift;
  my ($inbox) = @_;
  $m->deliver_to(config->inbox($inbox), @_);
}

sub maybe_backup {
  my $m = shift;

  $m->deliver_to(config->backup_folder()) if $m->{backup};
}

sub deliver_to {
  return if opt('wrong_class');

  my ($m, $folder) = @_;

  my $file = ($folder =~ m!^/!) ? $folder : config->mail_dir() . "/$folder";
  $file = folder_substs($file);

  if (opt('test_run')) {
    log_to_file "Would deliver to $file\n";
    return;
  }

  append_to_folder($file, $m);

  log_to_file "Delivered to $file\n";
}

sub accept_mail {
  return if opt('wrong_class');

  my $m = shift;
  my ($reason_ident, @details) = @_;

  $m->{accepted} = [ $reason_ident, @details ];

  my $text = "$reason_ident" .
             (@details ? " (@details)" : '');

  $m->{header}->add('X-Parp-Accepted', $text);
  log_to_file "Accepted: $text\n";
}

sub reject_junk_mail {
  return if opt('wrong_class');
  my $m = shift;
  $m->reject_mail(@_);
#  $m->{backup} = 0;
  $m->deliver_to_inbox('junk-mail');
}

sub reject_mail {
  return if opt('wrong_class');

  my $m = shift;
  my ($reason_ident, @details) = @_;

  $m->{rejected} = [ $reason_ident, @details ];

  my $text = "$reason_ident" .
             (@details ? " (@details)" : '') .
             "\n";

  $m->{header}->add('X-Parp-Rejected', $text);
  log_to_file "REJECTED: $text";
}

sub pipe_forward {
  return if opt('wrong_class');

  my $m = shift;
  my ($pipe_command) = @_;

  if (opt('dry_pipes') || opt('test_run')) {
    log_to_file "Would pipe | $pipe_command\n";
  }
  else {
    log_to_file "Piping | $pipe_command ... ";
    if (! open(PIPE, "| $pipe_command")) {
      error("Couldn't open pipe command $pipe_command: $!");
    }
    else {
      print PIPE $m->{mail}->as_mbox_string();
      close(PIPE) or error("close(| $pipe_command) failed: $!\n");
      log_to_file "done.\n";
    }
  }
}

1;
