#!/usr/bin/perl -w
#
### Introduction ###
# sa-stats   : SpamAssassin logfile analyser
# Version    : 1.1.7
# Date       : 30th March, 2006
# Author     : David Ramsden, david@hexstream.co.uk
# Credits    : - Andrew Berry for asking about such a
#                script and giving me the idea.
#              - D. Scott Barninger for producing a patch
#                to fix the strftime() problem in <= 1.1.4
#              - Mike Jackson for his patch to add SA 3.1.0
#                compatibility and the limit option.
#              - Jack Scagnetti for two fixes.
#
# Usage:
#   ./sa-stats.pl -h
#
#   sa-stats.pl reads input in from STDIN:
#       ./sa-stats.pl </var/log/mail.info
#       cat /var/log/mail.info | ./sa-stats.pl -t
#
#   Note: Not every Linux/UNIX distribution uses /var/log/mail.info
#         It may use /var/log/maillog
#         You will need to work this out for yourself.
#
### License (distributed under the zlib license) ###
# Copyright (c) 2006 David Ramsden
#
# This software is provided 'as-is', without any express or implied
# warranty. In no event will the authors be held liable for any damages
# arising from the use of this software.
#
# Permission is granted to anyone to use this software for any purpose,
# including commercial applications, and to alter it and redistribute it
# freely, subject to the following restrictions:
#
#       1. The origin of this software must not be misrepresented; you must
#       not claim that you wrote the original software. If you use this
#       software in a product, an acknowledgment in the product documentation
#       would be appreciated but is not required.
#
#       2. Altered source versions must be plainly marked as such, and must
#       not be misrepresented as being the original software.
#
#       3. This notice may not be removed or altered from any source
#       distribution.
#
###
use strict;
use Getopt::Long;
use POSIX qw(strftime);
### Global variable definitions ###
# Running totals per message class ('ham' / 'spam'); each entry holds the
# 'count', 'score' and 'threshold' sums filled in by update_stats().
my %global_stats = ();
# The same three sums, kept per lower-cased recipient and then per class.
my %user_stats   = ();
# Date prefix a log line must start with; undef means "process every line".
my $filter       = undef;
# Cap on the number of users listed; replaced by the -l value below.
my $userlimit    = 65534;
### Command line arguments ###
my ($opt_help, $opt_today, $opt_yesterday, $opt_limit);
GetOptions("help|h"      => \$opt_help,
           "today|t"     => \$opt_today,
           "yesterday|y" => \$opt_yesterday,
           "limit|l=i"   => \$opt_limit)
        or do
        {
                # GetOptions has already reported the offending option on
                # STDERR; stop here instead of producing a report the user
                # did not ask for.
                print STDERR "Try '$0 -h' for usage.\n";
                exit 1;
        };
if ($opt_help)
{
        print "Usage: $0 [options] </var/log/mail.info\n";
        print "Without any options the entire logfile is processed\n";
        print "Options can be any of the following:\n";
        printf("\t%-5s: Stats for todays date only.\n", "t");
        printf("\t%-5s: Stats for yesterdays date only.\n", "y");
        printf("\t%-5s: Stats for top n mail recipients.\n", "l n");
        exit 0;
}
if ($opt_today && $opt_yesterday)
{
        print "You can't use the -t option and -y option together.\n";
        exit 1;
}
if ($opt_today)
{
        # "%b %e" yields the month abbreviation and a space-padded day,
        # e.g. "Mar  5", which is compared against the start of each line.
        $filter = strftime("%b %e", localtime);
}
elsif ($opt_yesterday)
{
        # Step back 24 hours from local noon rather than from "now": days
        # are 23 or 25 hours long around a DST change, so "now - 24h" can
        # land on the wrong calendar day, whereas "noon - 24h" always falls
        # inside the previous day.
        my @now = localtime;
        my $noon_today = POSIX::mktime(0, 0, 12, $now[3], $now[4], $now[5]);
        $noon_today = time() unless defined($noon_today);
        $filter = strftime("%b %e", localtime($noon_today - (24 * 60 * 60)));
}
if ($opt_limit)
{
        # A limit of 0 is false and therefore leaves the listing unlimited.
        $userlimit = $opt_limit;
}
### Main code ###
# Compile the date filter once. \Q...\E makes the strftime() output match
# literally even if it contains a regex metacharacter.
my $filter_re = defined($filter) ? qr/^\Q$filter\E/ : undef;
# Maps the phrase spamd logs to the statistics class it is counted under.
my %stat_for = ('clean message'   => 'ham',
                'identified spam' => 'spam');
# One pattern for all four line shapes: spamd with or without a "[pid]"
# part, reporting either result. Captures: 1 = result phrase, 2 = score,
# 3 = threshold, 4 = recipient.
my $result_re = qr/spamd(?:\[[0-9].*?\])?: (clean message|identified spam) \((.*?)\/(.*?)\) for (.*?):/;
while (defined(my $line = <STDIN>))
{
        # Skip lines outside the requested day when -t or -y is in effect.
        next if defined($filter_re) && $line !~ $filter_re;
        if ($line =~ $result_re)
        {
                &update_stats($stat_for{$1}, $4, $2, $3);
        }
}
&show_stats();
exit 0;
### Sub-routines ###
# Print the complete report to the selected output handle: a title naming
# the period covered, the global ham/spam totals with average scores, then
# two per-user tables (message counts, and average score/threshold).
#
# Takes no arguments. Reads the file-level %global_stats, %user_stats,
# $filter, $userlimit and $opt_today / $opt_yesterday / $opt_limit.
sub show_stats
{
        if ($opt_today)
        {
                print "SpamAssassin statistics for today ($filter)\n";
        }
        elsif ($opt_yesterday)
        {
                print "SpamAssassin statistics for yesterday ($filter)\n";
        }
        else
        {
                print "SpamAssassin statistics for entire logfile\n";
        }
        draw_line(70);
        print "\n";
        # Global figures need at least one message of each class; both
        # counts are then non-zero, so the divisions below are safe.
        if (exists($global_stats{'spam'}) && exists($global_stats{'ham'}))
        {
                my $ham  = $global_stats{'ham'};
                my $spam = $global_stats{'spam'};
                my $total = $ham->{'count'} + $spam->{'count'};
                printf("%-30s %-10s %-10s %-10s\n", "Total messages:", "Ham:", "Spam:", "% Spam:");
                draw_line(70);
                printf("%-30d %-10d %-10d %1.2f%%\n", $total,
                                                      $ham->{'count'},
                                                      $spam->{'count'},
                                                      100 * ($spam->{'count'} / $total));
                print "\n";
                printf("%-30s: %1.2f/%1.2f\n", "Average spam score",
                                               $spam->{'score'} / $spam->{'count'},
                                               $spam->{'threshold'} / $spam->{'count'});
                printf("%-30s: %1.2f/%1.2f\n", "Average ham score",
                                               $ham->{'score'} / $ham->{'count'},
                                               $ham->{'threshold'} / $ham->{'count'});
        }
        else
        {
                if (!exists($global_stats{'ham'}))
                {
                        print "No ham (clean) messages found in logfile.\n";
                }
                if (!exists($global_stats{'spam'}))
                {
                        print "No spam (identified) messages found in logfile.\n";
                }
                print "Due to the above, not enough information is available to calculate\nglobal statistics.\n";
        }
        # Resolve the user list once so both tables show exactly the same
        # users in exactly the same order.
        my @usernames = displayed_usernames();
        print "\n";
        printf("%-30s %-7s %-7s %-7s %-7s\n", "Username:", "Total:", "Ham:", "Spam:", "% Spam:");
        draw_line(70);
        foreach my $username (@usernames)
        {
                my $ham_count  = $user_stats{$username}{'ham'}{'count'};
                my $spam_count = $user_stats{$username}{'spam'}{'count'};
                # A user only exists once a message was counted for them,
                # so the total is never zero.
                my $total = $ham_count + $spam_count;
                printf("%-30s %-7d %-7d %-7d %1.2f%%\n", $username,
                                                            $total,
                                                            $ham_count,
                                                            $spam_count,
                                                            100 * ($spam_count / $total));
        }
        print "\n";
        printf("%-30s %-20s %-20s\n", "Username:", "Avg. ham score:", "Avg. spam score:");
        draw_line(70);
        foreach my $username (@usernames)
        {
                printf("%-30s %-20s %-20s\n", $username,
                                              average_or_none($user_stats{$username}{'ham'}),
                                              average_or_none($user_stats{$username}{'spam'}));
        }
}
# Return the usernames to list, in display order. Without -l that is every
# user, alphabetically. With -l it is the busiest users first, cut off at
# $userlimit entries.
sub displayed_usernames
{
        if (!$opt_limit)
        {
                my @alphabetical = sort keys %user_stats;
                return @alphabetical;
        }
        my %total_for;
        foreach my $username (keys %user_stats)
        {
                $total_for{$username} = $user_stats{$username}{'ham'}{'count'} + $user_stats{$username}{'spam'}{'count'};
        }
        # Ties are broken by name so the order does not depend on hash order.
        my @busiest = sort { $total_for{$b} <=> $total_for{$a} or $a cmp $b } keys %total_for;
        # A limit below 1 is clamped to 1 so at least one user is shown.
        my $limit = $userlimit < 1 ? 1 : $userlimit;
        splice(@busiest, $limit) if @busiest > $limit;
        return @busiest;
}
# Format "average score/average threshold" for one class of one user.
#   $totals - hash reference holding 'count', 'score' and 'threshold' sums.
# Returns "None" when no message was counted. The test is on the count,
# not the score sum, because scores that sum to zero still have an average.
sub average_or_none
{
        my ($totals) = @_;
        return "None" unless $totals->{'count'};
        return sprintf("%1.2f/%1.2f", $totals->{'score'} / $totals->{'count'},
                                      $totals->{'threshold'} / $totals->{'count'});
}
# Record one classified message in the global and the per-user totals.
#
#   $stat      - class the message is counted under; callers in this file
#                pass 'ham' or 'spam'.
#   $username  - recipient as logged; folded to lower case so that case
#                variants of one name share a single entry.
#   $score     - score logged for the message.
#   $threshold - threshold logged for the message.
#
# Declared without a prototype: the sub takes four arguments, which an
# empty "()" prototype would contradict.
sub update_stats
{
        my ($stat, $username, $score, $threshold) = @_;
        $username = lc($username);
        # First message of this class: start its global sums at zero.
        if (!exists($global_stats{$stat}))
        {
                $global_stats{$stat}{'count'} = 0;
                $global_stats{$stat}{'score'} = 0;
                $global_stats{$stat}{'threshold'} = 0;
        }
        $global_stats{$stat}{'count'}++;
        $global_stats{$stat}{'score'} += $score;
        $global_stats{$stat}{'threshold'} += $threshold;
        # First message for this user: create both classes up front so the
        # report can read either one without checking for existence.
        if (!exists($user_stats{$username}))
        {
                $user_stats{$username}{'ham'}{'count'} = 0;
                $user_stats{$username}{'ham'}{'score'} = 0;
                $user_stats{$username}{'ham'}{'threshold'} = 0;
                $user_stats{$username}{'spam'}{'count'} = 0;
                $user_stats{$username}{'spam'}{'score'} = 0;
                $user_stats{$username}{'spam'}{'threshold'} = 0;
        }
        $user_stats{$username}{$stat}{'count'}++;
        $user_stats{$username}{$stat}{'score'} += $score;
        $user_stats{$username}{$stat}{'threshold'} += $threshold;
}
# Print a horizontal rule of '-' characters followed by a newline to the
# currently selected output handle.
#
#   $length - number of dashes to print; when omitted only the newline is
#             printed.
#
# Declared without a prototype: the sub takes one argument, which an empty
# "()" prototype would contradict at every call site.
sub draw_line
{
        my ($length) = @_;
        $length = 0 unless defined($length);
        print "-" x $length, "\n";
}