git.madduck.net Git - etc/mailfilter.git/blob - procmail/spamfilter

madduck's git repository

Every one of the projects in this repository is available at the canonical URL git://git.madduck.net/madduck/pub/<projectpath> — see each project's metadata for the exact URL.

All patches and comments are welcome. Please squash your changes to logical commits before using git-format-patch and git-send-email to patches@git.madduck.net. If you'd read over the Git project's submission guidelines and adhered to them, I'd be especially grateful.

SSH access, as well as push access, can be individually arranged.

If you use my repositories frequently, consider adding the following snippet to ~/.gitconfig and using the third clone URL listed for each project:

[url "git://git.madduck.net/madduck/"]
  insteadOf = madduck:

initial checkin
[etc/mailfilter.git] / procmail / spamfilter
1 #!/usr/bin/procmail
2
3 #TODO: rewrite to use SPAM variable, and do not autotrain spam here, only ham
4 # PMDIR locates the rc files included from here; an already-set, non-empty PMDIR wins over the default
5 PMDIR=${PMDIR:-$HOME/.etc/mailfilter/procmail}
6
7 :0
8 * !PMVAR ?? .
9 {
10   # PMVAR is empty or not defined, so we are being called as filter
11   # thus source the standard defines
12   INCLUDERC=$PMDIR/defines
13   # prevent feeding back to procmail and delete the leading From line
14   PROCMAIL='/bin/cat'
15   # and tell procmail to continue to be a filter: DEFAULT becomes a pipe into $PROCMAIL (left unexpanded by the single quotes; /bin/cat per the line above)
16   DEFAULT='|$PROCMAIL'
17 }
18
19 #VERBOSE=yes
20 # run the pre-spam-cleanup rc file before any of the checks below
21 INCLUDERC=$PMDIR/pre-spam-cleanup
22
23 # no need to reprocess messages that went into a spamtrap: when SPAMTRAPPED is non-empty, only log and tag the message with "X-Spam: spamtrapped"
24 :0
25 * SPAMTRAPPED ?? .
26 {
27   LOG="spamfilter:  skipping checks for spamtrapped message$NL"
28   :0 fw
29   |$FORMAIL -I"X-Spam: spamtrapped"
30 }
31
32 # let earlier parts of the mailfilter cause bypassing the checks: a non-empty SKIP_SPAMCHECKS is logged as the reason and the message is tagged "unknown"
33 :0 E
34 * SKIP_SPAMCHECKS ?? .
35 {
36   LOG="spamfilter:  skipping checks as requested: $SKIP_SPAMCHECKS$NL"
37   :0 fw
38   |$FORMAIL -I"X-Spam: unknown (skip requested)"
39   SPAM_UNKNOWN=skip-requested
40 }
41
42 # honour skip-spamchecks to exclude certain messages from spam checks
43 # altogether: $EGREP (presumably egrep — confirm in defines) is run with -q -i -f on $CONF/skip-spamchecks and its exit status decides the match; the B and H flags select body and header as its input
44 :0 EBH
45 * ? $EGREP -qif $CONF/skip-spamchecks
46 {
47   LOG="spamfilter:  skipping checks as per skip-spamchecks$NL"
48   :0 fw
49   |$FORMAIL -I"X-Spam: unknown (check skipped)"
50   SPAM_UNKNOWN=skip-match
51 }
52
53 # sanity check on message size: messages larger than $SPAMCHECK_MAX_MESSAGE_SIZE are logged and tagged "unknown"; because of the E flag this is only tried when the preceding skip recipes did not run
54 :0 E
55 * > $SPAMCHECK_MAX_MESSAGE_SIZE
56 {
57   LOG="spamfilter:  skipping check because message size exceeds $SPAMCHECK_MAX_MESSAGE_SIZE bytes$NL"
58   :0 fw
59   |$FORMAIL -I"X-Spam: unknown (message larger than $SPAMCHECK_MAX_MESSAGE_SIZE bytes)"
60   SPAM_UNKNOWN=too-large
61 }
62
63 # now run the spamfilters
64 :0 E
65 {
66   # crm114: these defaults stay in place when SKIP_CRM is set or no X-CRM114-Status header is found
67   CRM_SPAM=UNKNOWN
68   CRM_SCORE=0
69   :0
70   * !SKIP_CRM ?? .
71   {
72     :0 fw
73     |$CRM114
74     # pull the verdict word that follows "X-CRM114-Status: " into CRM_SPAM
75     :0
76     * ^X-CRM114-Status: \/[A-Z]+
77     { CRM_SPAM=$MATCH }
78     # and the signed number after an opening parenthesis on that header into CRM_SCORE
79     :0
80     * ^X-CRM114-Status: .+\([ ]*\/-?[.0-9]+
81     { CRM_SCORE=$MATCH }
82
83     LOG="crm114:      $CRM_SPAM/$CRM_SCORE$NL"
84   }
85
86   # spamassassin — NOTE(review): the default verdict below is stored in SA_STATUS, while the recipes and the LOG line read SA_SPAM; confirm which name is intended
87   SA_STATUS=Unknown
88   SA_SCORE=0
89   SA_TESTS=none
90   :0
91   * !SKIP_SA ?? .
92   {
93     :0 fw
94     |$SPAMC
95     # copy the verdict word, the score= value and the tests= value of the X-Spam-Status header into SA_SPAM, SA_SCORE and SA_TESTS
96     :0
97     * ^X-Spam-Status: \/[A-Za-z]+
98     { SA_SPAM=$MATCH }
99
100     :0
101     * ^X-Spam-Status: .+score=\/-?[.0-9]+
102     { SA_SCORE=$MATCH }
103
104     :0
105     * ^X-Spam-Status: .+tests=\/[^ ]+
106     { SA_TESTS=$MATCH }
107
108     LOG="SA:          $SA_SPAM/$SA_SCORE/$SA_TESTS$NL"
109   }
110
111   ## CASE 0: crm114 is unsure/untrained; the SA score decides whether retraining is scheduled
112   :0
113   * CRM_SPAM ?? UNSURE
114   {
115     # retrain as ham when the SA score is at or below the ham limit (perl exits non-zero otherwise)
116     :0
117     * ? perl -e "$SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM || exit 1"
118     {
119       LOG="spamfilter:  scheduling crm114 retraining with HAM (score $SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM)$NL"
120       :0 fw
121       |$FORMAIL -A "X-CRM114-Autotrain: ham, according to SA (score $SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM)"
122       CRM_RETRAIN=ham
123     }
124
125     # retrain as spam when the SA score is strictly above the spam limit
126     :0 E
127     * ? perl -e "$SA_SCORE > $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM || exit 1"
128     {
129       LOG="spamfilter:  scheduling crm114 retraining with SPAM (score $SA_SCORE > $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM)$NL"
130       :0 fw
131       |$FORMAIL -A "X-CRM114-Autotrain: spam, according to SA (score $SA_SCORE > $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM)"
132       CRM_RETRAIN=spam
133     }
134
135     # skip retraining if SA is not convinced, i.e. the score is above the ham limit but not above the spam limit; the messages state exactly that range
136     :0 E
137     {
138       LOG="spamfilter:  will not autotrain crm114 because SA is not convinced ($CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM < $SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM)$NL"
139       :0 fw
140       |$FORMAIL -A "X-CRM114-Autotrain: SA is unsure ($CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM < $SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM)"
141       SPAM_UNSURE=sa-unsure
142     }
143   }
144
145   ## CASE 1: disagreement, SA sees ham (crm114 verdict matches SPAM while the SA verdict matches No)
146   :0 E
147   * CRM_SPAM ?? SPAM
148   * SA_SPAM ?? No
149   {
150     # SA is convincing (score at or below the misclassify ham limit), so retrain crm114 as ham
151     :0
152     * ? perl -e "$SA_SCORE <= $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_HAM || exit 1"
153     {
154       LOG="spamfilter:  crm114 found spam ($CRM_SCORE), but SA is more convincing ($SA_SCORE <= $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_HAM)$NL"
155       CRM_RETRAIN=ham
156       :0 fw
157       |$FORMAIL -A "X-CRM114-Retrain: ham, according to SA (score $SA_SCORE <= $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_HAM)"
158     }
159
160     # SA is not convincing, mark as disagreement via SPAM_DISAGREE and the X-Spam header
161     :0 E
162     {
163       LOG="spamfilter:  crm114 found spam ($CRM_SCORE), but SA thinks it's ham ($SA_SCORE)$NL"
164       SPAM_DISAGREE=sa-ham
165       :0 fw
166       |$FORMAIL -I "X-Spam: disagree (crm114:spam/$CRM_SCORE SA:ham/$SA_SCORE)"
167     }
168   }
169
170   ## CASE 2: disagreement, SA sees spam (crm114 verdict matches GOOD while the SA verdict matches Yes)
171   :0 E
172   * CRM_SPAM ?? GOOD
173   * SA_SPAM ?? Yes
174   {
175     # SA is convincing (score above the misclassify spam limit), so retrain crm114 as spam
176     :0
177     * ? perl -e "$SA_SCORE > $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_SPAM || exit 1"
178     {
179       LOG="spamfilter:  crm114 found ham ($CRM_SCORE), but SA is more convincing ($SA_SCORE > $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_SPAM)$NL"
180       CRM_RETRAIN=spam
181       :0 fw
182       |$FORMAIL -A "X-CRM114-Retrain: spam, according to SA (score $SA_SCORE > $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_SPAM)"
183     }
184
185     # SA is not convincing, mark as disagreement via SPAM_DISAGREE and the X-Spam header
186     :0 E
187     {
188       LOG="spamfilter:  crm114 found ham ($CRM_SCORE), but SA thinks it's spam ($SA_SCORE)$NL"
189       SPAM_DISAGREE=sa-spam
190       :0 fw
191       |$FORMAIL -I "X-Spam: disagree (crm114:ham/$CRM_SCORE SA:spam/$SA_SCORE)"
192     }
193   }
194   ## CASE 3: agreement, both crm114 and SA see spam; IS_SPAM is set and the message is tagged "X-Spam: yes"
195   :0 E
196   * CRM_SPAM ?? SPAM
197   * SA_SPAM ?? Yes
198   { 
199     IS_SPAM=sa+crm
200     :0 fw
201     |$FORMAIL -I"X-Spam: yes (crm114:$CRM_SCORE SA:$SA_SCORE)"
202   }
203   ## otherwise: tried only when the E chain of cases above did not run; tag the message "X-Spam: no"
204   :0 Efw
205   |$FORMAIL -I"X-Spam: no (crm114:$CRM_SCORE SA:$SA_SCORE)"
206 }
207 # continue with the handlespam rc file (not visible here; presumably it acts on the variables and headers set above — confirm)
208 INCLUDERC=$PMDIR/handlespam
209 #VERBOSE=no