The log file we will be parsing is the standard procmail log. An authentic example of the log file from our main mail server (which receives around 10,000 spam mails per month) is shown below (slightly anonymized to remove the real name of the mailbox):
...
From sterne@gvt.net.br Thu Jul 16 00:03:32 2009
 Subject: *** SPAM (5.3) *** =?koi8-r?B?79DUyc3J2sHDydEg08HK1ME=?=
  Folder: /dev/null 14278
From yhrtjgommsg@boomboomroom.com Thu Jul 16 00:13:33 2009
 Subject: *** SPAM (6.7) *** Women will be begging you to sleep with you.
  Folder: /srv/mail/john/.Spam/new/1247696013.18366_0.lambda 2070
From ErikaFrazier12@aol.com Thu Jul 16 00:13:33 2009
 Subject: *** SPAM (41.5) *** Obama Allows Meds Sold Online
  Folder: /dev/null
...
Each entry for a received mail in the log consists of three lines: the sender, the
subject and the folder the mail will be stored in. For mails that are identified
(according to this setup) with 100% confidence as spam, the destination is set to
/dev/null
, i.e. they are immediately deleted. For mails
that are determined to be spam but where there might be a chance that they are
legitimate, they are stored in the user's Spam folder. From the above log we can see
that two mails are deleted immediately and one mail is stored in the user's Spam
folder.
To analyze this we will create the class ParseProcmailLogFile.
The constructor takes the file name of the log file as its only argument, and to get
hold of the statistics we use the method
ParseProcmailLogFile::GetStat($aWindowSize)
where $aWindowSize is the number of days back the stats should
be based on. The returned statistics will be an array with three array
elements with the following layout:
($dateArray, nbrDevNull, nbrSpamFolder)
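All three elements are arrays with one entry per day, in the same order: $dateArray holds the formatted dates, while nbrDevNull and nbrSpamFolder hold the number of spams that were deleted and the number that were stored in the Spam folder, respectively, on the corresponding date.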
Remember that the PHP process or user running the script must have read privileges for the log file.
We will not walk through the parsing class in any more detail than what is given in the phpdoc comments in the source below.
/**
 * Class ParseProcmailLogFile
 *
 * Reads a procmail log file and counts, per calendar day, how many mails
 * were delivered to /dev/null and how many were delivered to a ".Spam"
 * folder.
 */
class ParseProcmailLogFile {
    private $iFileName = '';

    /**
     * Constructor for the parse class
     *
     * @param mixed $aFileName Log file to read
     * @return ParseProcmailLogFile
     */
    function __construct($aFileName) {
        $this->iFileName = $aFileName;
    }

    /**
     * Get the next line without its trailing line terminator
     *
     * Only the line terminator ("\n" or "\r\n") is removed, so a final line
     * that is not newline-terminated is returned intact.
     *
     * @param mixed $fp Filepointer
     * @return string Read line without trailing line terminator, or the
     *                empty string when nothing more could be read
     */
    function GetLine($fp) {
        $s = fgets($fp);
        if ($s === false) {
            // End of file (or read error): there is no line to return.
            return '';
        }
        return rtrim($s, "\r\n");
    }

    /**
     * Get statistics from the parsed log file
     *
     * @param $aWindow Window size. How many days to include in the returned stats
     * @return array with (dates, number of killed spam, number of non killed spams).
     *               The three arrays are 0-indexed, have the same length and are
     *               ordered chronologically; entry i of each array describes the
     *               same day. Dates are formatted as 'D j M'.
     */
    function GetStat($aWindow) {
        $fp = fopen($this->iFileName, 'r');
        if ($fp === false) {
            JpGraphError::Raise('Cannot read log file');
            // Raise() presumably aborts by throwing; bail out regardless so
            // that we never try to read from an invalid handle.
            return array(array(), array(), array());
        }

        // We always keep the last 3 lines in the buffer to be able to get
        // the context of a line, since the folder is stored on one line and
        // the date of the mail on one of the two previous lines.
        $buf = array('', '', '');
        $devNullDays = array(); // Day (timestamp) of every /dev/null delivery
        $spamDays = array();    // Day (timestamp) of every Spam folder delivery

        while (!feof($fp)) {
            // Shift buffer one step
            $buf[0] = $buf[1];
            $buf[1] = $buf[2];
            $buf[2] = $this->GetLine($fp);

            // Only "Folder:" lines tell where a mail was delivered. Requiring
            // the prefix avoids counting e.g. a subject that mentions ".Spam".
            if (strpos($buf[2], 'Folder: ') === false) {
                continue;
            }
            $toDevNull = strpos($buf[2], 'Folder: /dev/null') !== false;
            $toSpamFolder = strpos($buf[2], '.Spam') !== false;
            if (!$toDevNull && !$toSpamFolder) {
                continue;
            }

            $day = $this->FindEntryDay($buf);
            if ($day === false) {
                continue;
            }
            if ($toDevNull) {
                $devNullDays[] = $day;
            }
            if ($toSpamFolder) {
                $spamDays[] = $day;
            }
        }
        fclose($fp);

        // Aggregate all the data per day. Either map may be empty.
        $daystat = array_count_values($devNullDays);
        $daystat2 = array_count_values($spamDays);

        // Every day seen in either map, in chronological order (the keys are
        // timestamps). Sorting is needed because a day may be present in only
        // one of the maps.
        $days = array_keys($daystat + $daystat2);
        sort($days);

        // Window and return the data
        $n = count($days);
        $start = $n > $aWindow ? $n - $aWindow : 0;
        $datax = array();
        $datay1 = array();
        $datay2 = array();
        for ($i = $start; $i < $n; ++$i) {
            $day = $days[$i];
            $datax[] = date('D j M', $day);
            $datay1[] = isset($daystat[$day]) ? $daystat[$day] : 0;
            $datay2[] = isset($daystat2[$day]) ? $daystat2[$day] : 0;
        }
        return array($datax, $datay1, $datay2);
    }

    /**
     * Find the day of the log entry whose folder line is in $buf[2]
     *
     * The "From " line is looked for two lines back first and then one line
     * back. Its last 24 characters are parsed as the date.
     *
     * @param array $buf The three most recently read lines, oldest first
     * @return mixed Timestamp for midnight of the entry's day, or false when
     *               no "From " line with a parseable date was found
     */
    private function FindEntryDay($buf) {
        if (strpos($buf[0], 'From ') !== false) {
            $fromLine = $buf[0];
        } elseif (strpos($buf[1], 'From ') !== false) {
            $fromLine = $buf[1];
        } else {
            return false;
        }

        $stamp = strtotime(substr($fromLine, -24));
        if ($stamp === false) {
            return false;
        }
        // Drop the time of day so that all mails from one day share a key.
        return strtotime(date('Y-m-d', $stamp));
    }
}