package Parp::Mail;

=head1 NAME

Parp::Mail - Mail::Internet decorator providing filtering functionality

=head1 SYNOPSIS

See L<Parp::Filter>.

=head1 DESCRIPTION

This class decorates Mail::Internet objects, providing them with
filtering functionality whilst avoiding polluting the namespace
of the Mail::Internet objects themselves.

=cut

use strict;
use warnings;

use Carp;
use Digest::MD5 qw(md5_base64);
use Mail::Address;
use Time::Local;

use Parp::Config qw(config);
use Parp::Options qw(opt);
use Parp::Utils qw(vprint diagnose check_file_dir month2i error fatal);

use base qw(Parp::Mail::Deliverable Parp::Mail::Friends
            Parp::Mail::Tests::Header Parp::Mail::Tests::Body);

use overload '""' => \&to_string;

=head1 CONSTRUCTORS

=head2 new($mail)

C<$mail> is the Mail::Internet object to be decorated.
Returns a Parp::Mail object which wraps around it.

=cut

sub new {
  my ($this, $mail) = @_;
  my $class = ref($this) || $this;
  my $self = bless {}, $class;
  my $m = $self; # syntactic sugar

  my $header = $mail->head;
  $m->{content_type} = $header->get('Content-Type') || '';
  $m->{mail}   = $mail;
  $m->{header} = $header;
  $m->{body}   = $self->_get_body($mail->body);
  $m->{body_scalar} = join '', map { s/^>From/From/; $_ } @{$m->{body}};

  $self->_calc_common_headers;
  $self->_check_resent_headers;
  $self->_check_envelope;
  $self->_extract_addrs;
  
  $m->{recvds_array}    = [ $header->get('Received') ];
  $m->{recvds}          = join '', @{$m->{recvds_array}};

  $self->_check_auth_sender;

  # Remove all previously existing parp headers except X-Parp-Id
  my @parp_headers = grep /^X-Parp-(?!Id)/, $m->header->tags;
  foreach my $parp_header (sort @parp_headers) {
    $m->{old_parp_headers}{$parp_header} = $m->header->get($parp_header)
      if opt('regression_test');
    $m->header->delete($parp_header);
  }

  $self->_do_parp_headers;
  
  $m->{backup}   = 1;     # back up by default
  $m->{complain} = 1;     # allow complaining by default

  return $self;
}

=head1 METHODS

=head2 Header accessors

The following methods are provided as quick accessors for the mail's
headers.  They return the value of the corresponding header (the
corresponding header is listed if it's not obvious).  For each one,
there is also a method obtained by prefixing the name with C<full_>,
which returns the complete, new-line terminated header.  For example,
if a mail C<$m> had an X-Mailing-List header:

  X-Mailing-List: mutt-users@mutt.org

then C<$m-E<gt>list> would return C<mutt-users@mutt.org>, and
C<$m-E<gt>full_list> would return the complete header verbatim.

=over 4

=item * from

=item * to 

=item * cc

=item * subject

=item * return_path

=item * reply_to

=item * list

Returns the value of the X-Mailing-List header.

=item * in_reply_to

=item * refs

Returns the value of the References header.

=item * id

Returns the value of the Message-ID header.

=item * date

=item * status

=item * a_to

Returns the value of the Apparently-To header.

=item * precedence

=item * mailer

=back

In addition to the above, the following methods are provided:

=over 4

=item * full_mail_from

Returns the mbox-style envelope (the C<From > header).

=item * mail_from

Ditto, but minus the leading C<From >.

=item * env_from

Like C<mail_from>, but also minus the trailing date.

=item * env_from_addr

Returns just the address contained in the envelope.

=item * from_addr

Returns the address contained in the C<From:> header.

=item * ftc_addrs

Returns an array of all addresses mentioned in any of the envelope,
C<From:>, C<To:>, C<Cc:>, C<X-Mailing-List>, C<Sender:>, and
C<X-Sender:> headers.

=item * sender

Returns the value of the C<Sender:> header, or if that is empty, the
value of the C<X-Sender:> header.

=item * mailer

Returns the value of the C<User-Agent:> header, or if that is empty,
the value of the C<X-Mailer:> header.

=item * organisation

Returns the value of the C<Organisation:> header, or if that is empty,
the value of the C<Organization:> header :-)

=back

=cut

my %headers = (
  parp_id      => 'X-Parp-Id',        from         => 'From',
  to           => 'To',               cc           => 'Cc',
  subject      => 'Subject',          return_path  => 'Return-Path',
  reply_to     => 'Reply-To',         list         => 'X-Mailing-List',
  in_reply_to  => 'In-Reply-To',      refs         => 'References',
  id           => 'Message-ID',       date         => 'Date',
  status       => 'Status',           a_to         => 'Apparently-To',
  precedence   => 'Precedence',       mailer       => 'X-Mailer',
);
my @methods = qw(mail_from env_from sender organisation);

# autogenerate read-only accessors
foreach my $method (keys(%headers), @methods) {
  no strict 'refs';
  *$method = sub { $_[0]->{cached_headers}{$method} };
}
foreach my $method (keys %headers) {
  no strict 'refs';
  my $full = 'full_' . $method;
  *$full = sub { $headers{$method} . ': ' . $_[0]->$method . "\n" };
}
sub full_mail_from { 'From ' . shift->mail_from . "\n" }

foreach my $method (qw/mail header/) {
  no strict 'refs';
  *$method = sub { $_[0]->{$method} };
}

sub _calc_common_headers {
  my $m = shift;

  my $header = $m->header;

  # It's easier if all the headers we deal with are single lines.
  # In particular, don't fold `From ' header, as we want that to match
  # a date regexp below.
  $header->unfold;

  # envelope from
  $header->mail_from('KEEP');
  if ($m->can('fromLine')) {
    # got a Mail::Box::Mbox::Message
    (my $mail_from = $m->mail->fromLine) =~ s/^From //;
    $m->_cache_header_val(mail_from => $mail_from);
  }
  else {
    $m->_cache_header(mail_from => 'From ', 'Mail-From');
  }

  while (my ($method, $hdr) = each %headers) {
    next if $method eq 'mailer';
    $m->_cache_header($method => $hdr);
  }

  $m->_cache_header(sender => qw/Sender X-Sender/);
  $m->_cache_header(mailer => qw/User-Agent X-Mailer/);
  $m->_cache_header(organisation => qw/Organisation Organization/);
}

sub _cache_header_val {
  my ($m, $header, $val) = @_;
  carp "$header already cached as `$m->{cached_headers}{$header}'\n"
    if $m->{cached_headers}{$header};
  chomp($m->{cached_headers}{$header} = $val);
}

sub _cache_header {
  my ($m, $header, @headers) = @_;
  carp "$header already cached as `$m->{cached_headers}{$header}'\n"
    if $m->{cached_headers}{$header};
  foreach my $h (@headers) {
    $m->{cached_headers}{$header} ||= $m->header->get($h);
    last if $m->{cached_headers}{$header};
  }
  $m->{cached_headers}{$header} ||= '';
  chomp $m->{cached_headers}{$header};
}

sub _do_parp_headers {
  my $m = shift;
  
  # Add a header for the process id to try to chase down obscure bugs.
  $m->header->add('X-Parp-pid', $$);
#  system("/bin/date >> ~/mail/.parp.pstree");
#  system("pstree >> ~/mail/.parp.pstree");

  $m->_calc_parp_id;
}

sub _calc_parp_id {
  my $m = shift;

  # Calculate a unique id which parp can always refer to.  We
  # calculate the MD5 digest of enough of the whole mail to ensure a
  # unique id, but without any bits which might change in some way
  # during the e-mail's life-span.  That guarantees that during any
  # subsequent reclassification of the e-mail as a false
  # positive/negative (in the spam detection sense) this unique id
  # will match with the original, so that the statistics calculation
  # program will work.
  
  my @immutable_headers = qw/mail_from from to cc subject
                             return_path reply_to in_reply_to
                             refs id date mailer/;
  my $immutable_header
    = join '', map { my $fhdr = "full_$_"; $m->$fhdr } @immutable_headers;
  $immutable_header .= $m->{recvds};

  my $immutable = $immutable_header .
                  "\n" .
                  $m->{body_scalar};
  $immutable =~ s/\n+$/\n/;

  my $time    = $m->{env_from_time} || time;
  my $parp_id = $time . "/" . md5_base64($immutable);

  # It was a PAIN to get construction of the immutable right.
  # Uncommenting this enables debugging of it if anything goes wrong.
#  $self->_write_immutable($immutable, $parp_id);

  if ($m->{parp_id}) {
    # This e-mail has already been run through parp, so it already has
    # an X-Parp-Id header.

    if ($m->{parp_id} ne $parp_id) {
      # Better check that the id we've just calculated is the same,
      # otherwise our calculation algorithm is in trouble.
#use Data::Dumper;
      error("Message already had a parp id of: $m->{parp_id}\n" .
            "       but recalculation yielded: $parp_id",
            #"\$m:\n", Dumper($m),
           );
    }
  }
  else {
    # This e-mail hasn't been touched by parp before, so stamp it with
    # a parp id.
    $m->header->add('X-Parp-Id', $parp_id);
    $m->{parp_id} = $parp_id;
  }
}

sub to_string {
  my $self = shift;
  return sprintf "parp e-mail (id %s)", $self->{parp_id};
}

sub _write_immutable {
  my $self = shift;
  my ($immutable, $parp_id) = @_;

  my $immutables_dir = config->mail_dir . '.immutables';
  return unless -d $immutables_dir;
  
  my $id_file = $parp_id;
  $id_file =~ s!/!_!g;
  $id_file = "$immutables_dir/$id_file";
    
  while (-e $id_file) {
    # Generate a unique suffix
    $id_file =~ s/(?:\.(\d+))?$/"." . (($1 || 0) + 1)/e;
  }
    
  if (open(FOO, ">$id_file")) {
    print FOO $immutable;
    close(FOO);
  } else {
    error("Couldn't open $id_file for writing: $!");
  }
}  

sub _get_body {
  my ($m, $body) = @_;

  return $body unless $m->{content_type} =~ m!^multipart/.*boundary=(.*)\n!s;

  # Deal with MIME multipart messages without using a very slow
  # parser from CPAN ...
  my $boundary = $1;
  $boundary =~ s/^"(.*)"$/$1/;
  $boundary = quotemeta $boundary;
  diagnose qq{Message is multipart; splitting on boundary "$1".\n};
  
  my @parts = split /--$boundary(?:--)?\n?/m, join('', @$body);

  diagnose "Deleting non-text parts ... \n";
  my @body_lines = ();
  foreach my $part (@parts) {
    my @lines = split /(?<=\n)/, $part;
    my $part_mail = new Mail::Internet(\@lines);
    next unless @lines;
    my $content_type = $part_mail->get('Content-Type');
    if ($content_type) {
      chomp $content_type;
      diagnose "Content-Type: $content_type";
      push @{ $m->{content_types} }, $content_type;

      if ($content_type !~ m!^text/\b!) {
        diagnose "; skipping ...\n";
        next;
      } else {
        diagnose "\n";
      }
    } else {
      $content_type = '_unspecified_';
      diagnose "Warning: Content-Type was unspecified; assuming plain text.\n";
    }

    push @body_lines, @{ $part_mail->body };
  }

  return \@body_lines;
}

sub _check_resent_headers {
  my ($self) = @_;

  # From RFC822:
  #
  # --------- 8< --------- 8< --------- 8< --------- 8< --------- 8< ---------
  #    4.2.  FORWARDING
  #  
  #         Some systems permit mail recipients to  forward  a  message,
  #    retaining  the original headers, by adding some new fields.  This
  #    standard supports such a service, through the "Resent-" prefix to
  #    field names.
  #  
  #         Whenever the string "Resent-" begins a field name, the field
  #    has  the  same  semantics as a field whose name does not have the
  #    prefix.  However, the message is assumed to have  been  forwarded
  #    by  an original recipient who attached the "Resent-" field.  This
  #    new field is treated as being more recent  than  the  equivalent,
  #    original  field.   For  example, the "Resent-From", indicates the
  #    person that forwarded the message, whereas the "From" field indi-
  #    cates the original author.
  #  
  #         Use of such precedence  information  depends  upon  partici-
  #    pants'  communication needs.  For example, this standard does not
  #    dictate when a "Resent-From:" address should receive replies,  in
  #    lieu of sending them to the "From:" address.
  #  
  #    Note:  In general, the "Resent-" fields should be treated as con-
  #           taining  a  set  of information that is independent of the
  #           set of original fields.  Information for  one  set  should
  #           not  automatically be taken from the other.  The interpre-
  #           tation of multiple "Resent-" fields, of the same type,  is
  #           undefined.
  # --------- 8< --------- 8< --------- 8< --------- 8< --------- 8< ---------
  #
  # So we only take values from Resent- headers when we can't get them
  # any other way but we really would prefer to have them.

  my %resent_headers = ( id => 'Message-ID' );
  foreach my $header_key (keys %resent_headers) {
    my $header_name = $resent_headers{$header_key};
    $self->{$header_key} ||= $self->{header}->get("Resent-$header_name") || '';
  }
}

sub _check_envelope {
  my ($m) = @_;

  if (! $m->mail_from) {
    error('Envelope From header missing', <<QMAIL);
If you are using qmail as your MTA, make sure your .qmail setup passes
the mail through the preline filter before being passed to parp, e.g.

  | preline /path/to/parp -dr

Otherwise, parp may not be compatible with your MTA.  Does it deliver
in mbox format?
QMAIL
  }

  if ($m->mail_from =~
      /(.*?)\s*(\w{3}) (\w{3}) ([\d ]\d) (\d\d):(\d\d):(\d\d) (\d{4})$/) {
    $m->_cache_header_val(env_from => $1);
    my ($dow, $month, $mday, $hour, $min, $sec, $year) =
      ($2, $3, $4, $5, $6, $7, $8);

    $m->{env_from_time} =
      timelocal($sec, $min, $hour, $mday, month2i($month), $year);
  }
  else {
    $m->_cache_header_val(env_from => $m->mail_from);
    error("Envelope From header had weird date format", $m->mail_from);
  }
}

sub _extract_addrs {
  my ($m) = @_;

  # Certain headers should only have one address.
  my @singletons = qw/env_from from sender list reply_to return_path id/;

  my %addrs = ();
  foreach my $hdr (@singletons, qw/to cc/) {
    $addrs{$hdr} = [ map { $_->address } Mail::Address->parse($m->$hdr) ];
  }

  my @broken = ();
  foreach my $hdr (@singletons) {
    push @broken, [ $hdr, $addrs{$hdr} ] if @{ $addrs{$hdr} } > 1;
    $addrs{$hdr} = $addrs{$hdr}[0];
  }
  error("Some headers had more than one address",
        map $m->_broken_headers($_), @broken)
    if @broken;

  $addrs{ftc}       = [ grep $_, @addrs{qw/env_from from list sender/},
                                 @{ $addrs{to} }, @{ $addrs{cc} }       ];
  $addrs{froms}     = [ grep $_, @addrs{qw/env_from from/}              ];
  $addrs{all_froms} = [ grep $_, @addrs{qw/env_from from reply_to
                                           return_path sender id/}      ];
  $m->{addrs} = \%addrs;
}

sub _broken_headers {
  my $m = shift;
  my ($broken) = @_;
  my ($hdr, $addrs) = @$broken;
  my $full = 'full_' . $hdr;
  return $m->$full . join '', map { "  + $_\n" } @$addrs;
}

sub from_addr     { shift->{addrs}{from}     }
sub env_from_addr { shift->{addrs}{env_from} }
sub ftc_addrs     { shift->{addrs}{ftc}      }

sub _check_auth_sender {
  my ($self) = @_;

  my @comments        = $self->{header}->get('Comments');
  $self->{comments}        = \@comments;

  $self->{auth_sender}     = '';
  foreach my $comment (@comments) {
    if ($comment =~ /^Authenticated sender is (.*)/i) {
      $self->{auth_sender} = $1;
      last;
    }
  }
}

=head2 Inherited methods

Parp::Mail inherits from the following classes, each of which
provides additional filtering methods:

=over 4

=item * Parp::Mail::Deliverable

=item * Parp::Mail::Friends
            
=item * Parp::Mail::Tests::Header

=item * Parp::Mail::Tests::Body

=back

=cut

1;
