procmail/spamfilter

   1 #!/usr/bin/procmail
   2
   3 #TODO: rewrite to use SPAM variable, and do not autotrain spam here, only ham
   4
   5 PMDIR=${PMDIR:-$HOME/.etc/mailfilter/procmail}
   6 # filter-mode setup: the block below runs only when PMVAR is unset or empty
   7 :0
   8 * !PMVAR ?? .
   9 {
  10   # PMVAR is not defined, so we are being called as filter
  11   # thus source the standard defines
  12   INCLUDERC=$PMDIR/defines
  13   # prevent feeding back to procmail and delete the leading From line
  14   PROCMAIL='/bin/cat'
  15   # and make the default delivery a pipe to $PROCMAIL so that procmail keeps acting as a filter
  16   DEFAULT='|$PROCMAIL'
  17 }
  18
  19 #VERBOSE=yes
  20
  21 # no need to reprocess messages that went into a spamtrap
  22 # UPDATE: retrain them only if diagnosed as non-spam, see below
  23 # Note: add E flag to next recipe when uncommenting
  24 #:0
  25 #* SPAMTRAPPED ?? .
  26 #{
  27 #  LOG="spamfilter:  skipping checks for spamtrapped message$NL"
  28 #  :0 fw
  29 #  |$FORMAIL -I"X-Spam: spamtrapped"
  30 #}
  31 # check whether this message is being reinjected: if it matches $REPROC_MESSAGE and already carries
  32 # an X-Trained-As header, the E-chained checks below are skipped (the bare TRAINED_AS unsets that variable)
  33 TRAINED_AS
  34 :0
  35 *$ $REPROC_MESSAGE
  36 * ^X-Trained-As: \/(h|sp)am
  37 {
  38   LOG="spamfilter:  skipping already trained $MATCH$NL"
  39   :0
  40   * MATCH ?? spam
  41   { IS_SPAM=already-trained }
  42 }
  43
  44 # let earlier parts of the mailfilter bypass the checks by setting SKIP_SPAMCHECKS (its value is logged as the reason)
  45 :0 E
  46 * SKIP_SPAMCHECKS ?? .
  47 {
  48   LOG="spamfilter:  skipping checks as requested: $SKIP_SPAMCHECKS$NL"
  49   :0 fw
  50   |$FORMAIL -I"X-Spam: unknown (skip requested)"
  51   SPAM_UNKNOWN=skip-requested
  52 }
  53
  54 # honour skip-spamchecks: a message whose header or body matches a pattern from
  55 # $CONF/skip-spamchecks is excluded from spam checks altogether (assumes $EGREP is egrep-compatible)
  56 :0 EBH
  57 * ? $EGREP -qif $CONF/skip-spamchecks
  58 {
  59   LOG="spamfilter:  skipping checks as per skip-spamchecks$NL"
  60   :0 fw
  61   |$FORMAIL -I"X-Spam: unknown (check skipped)"
  62   SPAM_UNKNOWN=skip-match
  63 }
  64
  65 # sanity check on message size: oversized messages are tagged "unknown" and not passed to the filters
  66 :0 E
  67 * > $SPAMCHECK_MAX_MESSAGE_SIZE
  68 {
  69   LOG="spamfilter:  skipping check because message size exceeds $SPAMCHECK_MAX_MESSAGE_SIZE bytes$NL"
  70   :0 fw
  71   |$FORMAIL -I"X-Spam: unknown (message larger than $SPAMCHECK_MAX_MESSAGE_SIZE bytes)"
  72   SPAM_UNKNOWN=too-large
  73 }
  74
  75 # now run the spamfilters
  76 :0 E
  77 {
  78   INCLUDERC=$PMDIR/pre-spam-cleanup
  79
  80   # crm114: the defaults below stay in effect when SKIP_CRM is set or no status header is found
  81   CRM_SPAM=UNKNOWN
  82   CRM_SCORE=0
  83   :0
  84   * !SKIP_CRM ?? .
  85   {
  86     :0 fw
  87     |$CRM114
  88     # CRM_SPAM becomes the run of letters that follows "X-CRM114-Status: "
  89     :0
  90     * ^X-CRM114-Status: \/[A-Z]+
  91     { CRM_SPAM=$MATCH }
  92     # CRM_SCORE becomes the (possibly negative) number after the opening parenthesis
  93     :0
  94     * ^X-CRM114-Status: .+\([ ]*\/-?[.0-9]+
  95     { CRM_SCORE=$MATCH }
  96
  97     LOG="crm114:      $CRM_SPAM/$CRM_SCORE$NL"
  98   }
  99
 100   # spamassassin; NOTE(review): SA_STATUS is set here but only SA_SPAM is read below - confirm which is intended
 101   SA_STATUS=Unknown
 102   SA_SCORE=0
 103   SA_TESTS=none
 104   :0
 105   * !SKIP_SA ?? .
 106   {
 107     :0 fw
 108     |$SPAMC
 109     # SA_SPAM becomes the first word of X-Spam-Status; later recipes test it against "No" and "Yes"
 110     :0
 111     * ^X-Spam-Status: \/[A-Za-z]+
 112     { SA_SPAM=$MATCH }
 113     # SA_SCORE becomes the (possibly negative) number after "score="
 114     :0
 115     * ^X-Spam-Status: .+score=\/-?[.0-9]+
 116     { SA_SCORE=$MATCH }
 117     # SA_TESTS becomes everything after "tests=" up to the next space
 118     :0
 119     * ^X-Spam-Status: .+tests=\/[^ ]+
 120     { SA_TESTS=$MATCH }
 121
 122     LOG="SA:          $SA_SPAM/$SA_SCORE/$SA_TESTS$NL"
 123   }
 124
 125   ## CASE 0: crm114 is unsure/untrained; the spamtrap or the SA score decides about training
 126   :0
 127   * CRM_SPAM ?? UNSURE
 128   {
 129     # retrain spamtrapped message as spam, whatever the SA score is
 130     :0
 131     * SPAMTRAPPED ?? .
 132     {
 133       LOG="spamfilter:  scheduling retraining with SPAM due to spamtrap$NL"
 134       :0 fw
 135       |$FORMAIL -A "X-CRM114-Autotrain: spam, due to spamtrap"
 136       RETRAIN=spam
 137     }
 138
 139     # retrain as ham when the SA score is at or below the ham limit
 140     :0 E
 141     * ? perl -e "$SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM || exit 1"
 142     {
 143       LOG="spamfilter:  scheduling retraining with HAM (score $SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM)$NL"
 144       :0 fw
 145       |$FORMAIL -A "X-CRM114-Autotrain: ham, according to SA (score $SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM)"
 146       RETRAIN=ham
 147     }
 148
 149     # retrain as spam when the SA score is above the spam limit
 150     :0 E
 151     * 1^0 ? perl -e "$SA_SCORE > $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM || exit 1"
 152     {
 153       LOG="spamfilter:  scheduling retraining with SPAM (score $SA_SCORE > $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM)$NL"
 154       :0 fw
 155       |$FORMAIL -A "X-CRM114-Autotrain: spam, according to SA (score $SA_SCORE > $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM)"
 156       RETRAIN=spam
 157     }
 158
 159     # skip retraining if SA is not convinced; reached when neither test above passed (ham limit < score <= spam limit)
 160     :0 E
 161     {
 162       LOG="spamfilter:  will not autotrain crm114 because SA is not convinced ($CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM < $SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM)$NL"
 163       :0 fw
 164       |$FORMAIL -A "X-CRM114-Autotrain: SA is unsure ($CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM < $SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM)"
 165       SPAM_UNSURE=sa-unsure
 166     }
 167   }
 168
 169   ## CASE 1: disagreement, crm114 sees spam but SA sees ham
 170   :0 E
 171   * CRM_SPAM ?? SPAM
 172   * SA_SPAM ?? No
 173   {
 174     # message was spamtrapped anyway, so the spamtrap settles it: retrain as spam
 175     :0
 176     * SPAMTRAPPED ?? .
 177     {
 178       LOG="spamfilter:  resolving crm114/SA disagreement due to spamtrap ($CRM_SCORE/$SA_SCORE)$NL"
 179       RETRAIN=spam
 180       :0 fw
 181       |$FORMAIL -A "X-CRM114-Retrain: spam, due to spamtrap"
 182     }
 183
 184     # SA is convincing (score at or below the ham limit), so retrain crm114 as ham
 185     :0 E
 186     * ? perl -e "$SA_SCORE <= $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_HAM || exit 1"
 187     {
 188       LOG="spamfilter:  crm114 found spam ($CRM_SCORE), but SA is more convincing ($SA_SCORE <= $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_HAM)$NL"
 189       RETRAIN=ham
 190       :0 fw
 191       |$FORMAIL -A "X-CRM114-Retrain: ham, according to SA (score $SA_SCORE <= $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_HAM)"
 192     }
 193
 194     # SA is not convincing, mark as disagreement (sets SPAM_DISAGREE and replaces X-Spam)
 195     :0 E
 196     {
 197       LOG="spamfilter:  crm114 found spam ($CRM_SCORE), but SA thinks it's ham ($SA_SCORE)$NL"
 198       SPAM_DISAGREE=sa-ham
 199       :0 fw
 200       |$FORMAIL -I "X-Spam: disagree (crm114:spam/$CRM_SCORE SA:ham/$SA_SCORE)"
 201     }
 202   }
 203
 204   ## CASE 2: disagreement, crm114 sees ham but SA sees spam
 205   :0 E
 206   * CRM_SPAM ?? GOOD
 207   * SA_SPAM ?? Yes
 208   {
 209     # message was spamtrapped anyway, so the spamtrap settles it: retrain as spam
 210     :0
 211     * SPAMTRAPPED ?? .
 212     {
 213       LOG="spamfilter:  resolving crm114/SA disagreement due to spamtrap ($CRM_SCORE/$SA_SCORE)$NL"
 214       RETRAIN=spam
 215       :0 fw
 216       |$FORMAIL -A "X-CRM114-Retrain: spam, due to spamtrap"
 217     }
 218
 219     # SA is convincing, so retrain crm114; E keeps this and the next recipe from running after the spamtrap branch
 220     :0 E
 221     * ? perl -e "$SA_SCORE > $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_SPAM || exit 1"
 222     {
 223       LOG="spamfilter:  crm114 found ham ($CRM_SCORE), but SA is more convincing ($SA_SCORE > $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_SPAM)$NL"
 224       RETRAIN=spam
 225       :0 fw
 226       |$FORMAIL -A "X-CRM114-Retrain: spam, according to SA (score $SA_SCORE > $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_SPAM)"
 227     }
 228
 229     # SA is not convincing, mark as disagreement (sets SPAM_DISAGREE and replaces X-Spam)
 230     :0 E
 231     {
 232       LOG="spamfilter:  crm114 found ham ($CRM_SCORE), but SA thinks it's spam ($SA_SCORE)$NL"
 233       SPAM_DISAGREE=sa-spam
 234       :0 fw
 235       |$FORMAIL -I "X-Spam: disagree (crm114:ham/$CRM_SCORE SA:spam/$SA_SCORE)"
 236     }
 237   }
 238   ## agreement: crm114 and SA both see spam, so flag the message via IS_SPAM and X-Spam
 239   :0 E
 240   * CRM_SPAM ?? SPAM
 241   * SA_SPAM ?? Yes
 242   {
 243     IS_SPAM=sa+crm
 244     :0 fw
 245     |$FORMAIL -I"X-Spam: yes (crm114:$CRM_SCORE SA:$SA_SCORE)"
 246   }
 247   ## fall-through: none of the cases above applied, so tag the message as not spam
 248   :0 Efw
 249   |$FORMAIL -I"X-Spam: no (crm114:$CRM_SCORE SA:$SA_SCORE)"
 250 }
 251
 252 # schedule spamtrapped ham for retraining as spam: the message was spamtrapped, yet IS_SPAM is still empty
 253 :0
 254 * SPAMTRAPPED ?? .
 255 * ! IS_SPAM ?? .
 256 {
 257   LOG="spamfilter:  found spamtrapped ham, retraining...$NL"
 258   :0 fw
 259   |$FORMAIL -I"X-Spam: spamtrapped ham"
 260   IS_SPAM=spamtrapped-ham
 261   RETRAIN=spam
 262 }
 263 # finally source handlespam from PMDIR; this line sits outside every recipe, so it is always reached
 264 INCLUDERC=$PMDIR/handlespam
 265 #VERBOSE=no