git.madduck.net Git - etc/mailfilter.git/blob - procmail/spamfilter

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes to logical commits before using git-format-patch and git-send-email to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access, can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

rename OLD_MESSAGE to REPROC_MESSAGE
[etc/mailfilter.git] / procmail / spamfilter
1 #!/usr/bin/procmail
2
3 #TODO: rewrite to use SPAM variable, and do not autotrain spam here, only ham
4 # location of the procmail rc fragments; an existing PMDIR value takes precedence
5 PMDIR=${PMDIR:-$HOME/.etc/mailfilter/procmail}
6
7 :0
8 * !PMVAR ?? .
9 {
10   # PMVAR is empty or unset, so we are being called as a filter;
11   # thus source the standard defines ourselves
12   INCLUDERC=$PMDIR/defines
13   # prevent feeding back to procmail and delete the leading From line
14   PROCMAIL='/bin/cat'
15   # and make the default delivery a pipe, so that procmail keeps acting as a filter
16   DEFAULT='|$PROCMAIL'
17 }
18
19 #VERBOSE=yes
20
21 INCLUDERC=$PMDIR/pre-spam-cleanup
22
23 # no need to reprocess messages that went into a spamtrap
24 # UPDATE: retrain them only if diagnosed as non-spam, see below
25 #:0
26 #* SPAMTRAPPED ?? .
27 #{ 
28 #  LOG="spamfilter:  skipping checks for spamtrapped message$NL"
29 #  :0 fw
30 #  |$FORMAIL -I"X-Spam: spamtrapped"
31 #}
32 # NOTE(review): the E flag ties this to whichever recipe ran last before it - confirm that is intended
33 # let earlier parts of the mailfilter cause bypassing the checks
34 :0 E
35 * SKIP_SPAMCHECKS ?? .
36 {
37   LOG="spamfilter:  skipping checks as requested: $SKIP_SPAMCHECKS$NL"
38   :0 fw
39   |$FORMAIL -I"X-Spam: unknown (skip requested)"
40   SPAM_UNKNOWN=skip-requested
41 }
42
43 # honour skip-spamchecks: exclude a message from spam checks altogether if
44 # `$EGREP -qif $CONF/skip-spamchecks` succeeds on its header and body (flags HB)
45 :0 EBH
46 * ? $EGREP -qif $CONF/skip-spamchecks
47 {
48   LOG="spamfilter:  skipping checks as per skip-spamchecks$NL"
49   :0 fw
50   |$FORMAIL -I"X-Spam: unknown (check skipped)"
51   SPAM_UNKNOWN=skip-match
52 }
53
54 # sanity check on message size: oversized messages are not fed to the filters
55 :0 E
56 * > $SPAMCHECK_MAX_MESSAGE_SIZE
57 {
58   LOG="spamfilter:  skipping check because message size exceeds $SPAMCHECK_MAX_MESSAGE_SIZE bytes$NL"
59   :0 fw
60   |$FORMAIL -I"X-Spam: unknown (message larger than $SPAMCHECK_MAX_MESSAGE_SIZE bytes)"
61   SPAM_UNKNOWN=too-large
62 }
63
64 # now run the spamfilters (only reached if none of the skip recipes above fired)
65 :0 E
66 {
67   # crm114: filter through $CRM114, then read verdict and score from X-CRM114-Status
68   CRM_SPAM=UNKNOWN
69   CRM_SCORE=0
70   :0
71   * !SKIP_CRM ?? .
72   {
73     :0 fw
74     |$CRM114
75
76     :0
77     * ^X-CRM114-Status: \/[A-Z]+
78     { CRM_SPAM=$MATCH }
79
80     :0
81     * ^X-CRM114-Status: .+\([ ]*\/-?[.0-9]+
82     { CRM_SCORE=$MATCH }
83
84     LOG="crm114:      $CRM_SPAM/$CRM_SCORE$NL"
85   }
86
87   # spamassassin: SA_SPAM needs a default, the cases below test it even if SA is skipped
88   SA_SPAM=Unknown
89   SA_SCORE=0
90   SA_TESTS=none
91   :0
92   * !SKIP_SA ?? .
93   {
94     :0 fw
95     |$SPAMC
96
97     :0
98     * ^X-Spam-Status: \/[A-Za-z]+
99     { SA_SPAM=$MATCH }
100
101     :0
102     * ^X-Spam-Status: .+score=\/-?[.0-9]+
103     { SA_SCORE=$MATCH }
104
105     :0
106     * ^X-Spam-Status: .+tests=\/[^ ]+
107     { SA_TESTS=$MATCH }
108
109     LOG="SA:          $SA_SPAM/$SA_SCORE/$SA_TESTS$NL"
110   }
111
112   ## CASE 0: crm114 is unsure/untrained
113   :0
114   * CRM_SPAM ?? UNSURE
115   {
116     # retrain as ham
117     :0
118     * ? perl -e "$SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM || exit 1"
119     {
120       LOG="spamfilter:  scheduling crm114 retraining with HAM (score $SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM)$NL"
121       :0 fw
122       |$FORMAIL -A "X-CRM114-Autotrain: ham, according to SA (score $SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM)"
123       RETRAIN=ham
124     }
125
126     # retrain as spam
127     :0 E
128     * ? perl -e "$SA_SCORE > $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM || exit 1"
129     {
130       LOG="spamfilter:  scheduling crm114 retraining with SPAM (score $SA_SCORE > $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM)$NL"
131       :0 fw
132       |$FORMAIL -A "X-CRM114-Autotrain: spam, according to SA (score $SA_SCORE > $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM)"
133       RETRAIN=spam
134     }
135
136     # skip retraining if SA is not convinced, i.e. HAM limit < score <= SPAM limit
137     :0 E
138     {
139       LOG="spamfilter:  will not autotrain crm114 because SA is not convinced ($CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM < $SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM)$NL"
140       :0 fw
141       |$FORMAIL -A "X-CRM114-Autotrain: SA is unsure ($CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM < $SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM)"
142       SPAM_UNSURE=sa-unsure
143     }
144   }
145
146   ## CASE 1: disagreement, SA sees ham (verdict anchored: "No" also occurs in "Unknown")
147   :0 E
148   * CRM_SPAM ?? SPAM
149   * SA_SPAM ?? ^^No^^
150   {
151     # SA is convincing, so retrain crm114
152     :0
153     * ? perl -e "$SA_SCORE <= $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_HAM || exit 1"
154     {
155       LOG="spamfilter:  crm114 found spam ($CRM_SCORE), but SA is more convincing ($SA_SCORE <= $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_HAM)$NL"
156       RETRAIN=ham
157       :0 fw
158       |$FORMAIL -A "X-CRM114-Retrain: ham, according to SA (score $SA_SCORE <= $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_HAM)"
159     }
160
161     # SA is not convincing, mark as disagreement
162     :0 E
163     {
164       LOG="spamfilter:  crm114 found spam ($CRM_SCORE), but SA thinks it's ham ($SA_SCORE)$NL"
165       SPAM_DISAGREE=sa-ham
166       :0 fw
167       |$FORMAIL -I "X-Spam: disagree (crm114:spam/$CRM_SCORE SA:ham/$SA_SCORE)"
168     }
169   }
170
171   ## CASE 2: disagreement, SA sees spam
172   :0 E
173   * CRM_SPAM ?? GOOD
174   * SA_SPAM ?? ^^Yes^^
175   {
176     # SA is convincing, so retrain crm114
177     :0
178     * ? perl -e "$SA_SCORE > $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_SPAM || exit 1"
179     {
180       LOG="spamfilter:  crm114 found ham ($CRM_SCORE), but SA is more convincing ($SA_SCORE > $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_SPAM)$NL"
181       RETRAIN=spam
182       :0 fw
183       |$FORMAIL -A "X-CRM114-Retrain: spam, according to SA (score $SA_SCORE > $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_SPAM)"
184     }
185
186     # SA is not convincing, mark as disagreement
187     :0 E
188     {
189       LOG="spamfilter:  crm114 found ham ($CRM_SCORE), but SA thinks it's spam ($SA_SCORE)$NL"
190       SPAM_DISAGREE=sa-spam
191       :0 fw
192       |$FORMAIL -I "X-Spam: disagree (crm114:ham/$CRM_SCORE SA:spam/$SA_SCORE)"
193     }
194   }
195   ## CASE 3: both filters agree on spam
196   :0 E
197   * CRM_SPAM ?? SPAM
198   * SA_SPAM ?? ^^Yes^^
199   {
200     IS_SPAM=sa+crm
201     :0 fw
202     |$FORMAIL -I"X-Spam: yes (crm114:$CRM_SCORE SA:$SA_SCORE)"
203   }
204   ## default: none of the cases above matched, so mark the message as not spam
205   :0 Efw
206   |$FORMAIL -I"X-Spam: no (crm114:$CRM_SCORE SA:$SA_SCORE)"
207 }
208
209 # schedule spamtrapped messages not flagged as spam above (IS_SPAM empty) for retraining as spam
210 :0
211 * SPAMTRAPPED ?? .
212 * ! IS_SPAM ?? .
213 { RETRAIN=spam }
214
215 INCLUDERC=$PMDIR/handlespam
216 #VERBOSE=no