From: martin f. krafft <madduck@madduck.net>
Date: Mon, 25 Feb 2008 15:58:35 +0000 (+0100)
Subject: add threshold values
X-Git-Url: https://git.madduck.net/etc/mailfilter.git/commitdiff_plain/45b624514190876194da3dcb2c0efe9ee3639407

add threshold values
---

diff --git a/crm114/mailfilter.cf b/crm114/mailfilter.cf
index 95006bd..f6e5bba 100644
--- a/crm114/mailfilter.cf
+++ b/crm114/mailfilter.cf
@@ -88,6 +88,11 @@
 #
 ##########################################################################
           
+#  ----------- define an optional target for where to send spam (that is,
+#   ------------ emails that we want to "fail", or reject to another
+#    ------------ address).  Note that this is NOT a "program fault" address,
+#     ------------ but where to send "bad" email in the general case.
+#      ------------ You can specify tightly controlled conditions too.
 #  ----------- define an optional target for where to send spam,  
 #   ----------- To NOT forward this to another account, just leave the
 #    ----------- address as the empty string, which is '//'.
@@ -103,7 +108,8 @@
 
 #   -------- If you would prefer to send specific kinds of spam to 
 #    -------- different mailboxes, here's where to do it.
-#     ----------(be sure to uncomment the line!)
+#     -------- (be sure to uncomment the line!).  Again, these are
+#      --------- not "program fault" conditions, just different filter results.
 #
 # :fail_priority_mail_to:  /where_priority_fails_go/
 # :fail_blacklist_mail_to:  /where_blacklist_fails_go/
@@ -216,6 +222,10 @@
 :log_all_mail_to_file: //
 #:log_all_mail_to_file: /my_personal_mail_log_file_name.txt/
 
+#     --------- When we log messages, should we use a mail separator?
+#      --------- And, if so, what?
+:mail_separator: /\n-=-=-=-=-=-=- cut here -=-=-=-=-=-=-\n/
+
 #
 #     ---------- Message Cacheing for retraining - do we keep a cache of
 #    ---------- messages we've classified recently "in the wild" as retrain
@@ -259,8 +269,8 @@
 #            ------  options to shuffle.crm will work fine.  Alternatively,
 #             ------  you can use the "sort --key 1.2" on date-named files to
 #              -----   achieve chronological training
-:trainer_randomizer_command: / .\/shuffle.crm /
-#:trainer_randomizer_command: / .\/crm114_tre shuffle.crm /
+:trainer_randomizer_command: /.\/shuffle.crm/
+#:trainer_randomizer_command: /.\/crm114 shuffle.crm/
 #:trainer_randomizer_command: /sort --key 1.2/
 
 
@@ -366,24 +376,30 @@
 #
 #
 #
-
-# --------- If you are using thick-threshold training of any sort,
-# ---------- (for OSBF or otherwise) put the threshold here.
-# ----------- Remember, the polarity of the thick threshold value
-# ------------ is that anything scoring less than this value
-# ------------- should be considered a trainable error, and it "flips over"
-# -------------- so that both good mail and spam have the right symmetrical
-# ---------------- polarity.
-
-#     ------ a very small thick threshold (or zero!) works for Markovian.
-#:thick_threshold: /0/
-#:thick_threshold: /.001/
-#     ----- a thick threshold of 5 to 20 seems good for OSB, OSBF, 
-#     Hyperspace, Bit-Entropy, and Winnow
-#:thick_threshold: /5.0/
-:thick_threshold: /10.0/
-#:thick_threshold: /20.0/
+#     -------- Thresholds for GOOD/UNSURE/SPAM thick-threshold training
+#     -------
+#     ------ A very small thick threshold (or zero!) works for Markovian.
+#     ----- A thick threshold of 5 to 20 seems good for OSB, OSBF, 
+#     ---- Hyperspace, Bit-Entropy, and Winnow.  If you want an asymmetric
+#     --- threshold system, you can do that by having :good_threshold:
+#     -- be different from :spam_threshold:.  The defaults are +/- 10.0
 #
+#
+#   ---- Things rated equal to or better than this are GOOD email
+#:good_threshold: /0.01/
+#:good_threshold: /5.0/
+:good_threshold: /10.0/
+#:good_threshold: /20.0/
+#
+#   ---- Things rated less than or equal to this are SPAM
+#:spam_threshold: /-0.01/
+#:spam_threshold: /-5.0/
+:spam_threshold: /-10.0/
+#:spam_threshold: /-20.0/
+  
+#   ---- mailfilter uses a single threshold and operates symmetrically.
+#   --- (:thick_threshold: is provided only for backward compatibility.)
+:thick_threshold: /10.0/
 
 #   ---- What regex do we use for LEARN/CLASSIFY?  the first is the
 #   ---- "old standard".  Other ones are handy for different spam
@@ -391,7 +407,7 @@
 #   ---- packed HTML spam, which is almost everybody in 2003, so it
 #   ---- used to be the default.  But since spammers have shifted away
 #   ---- from this, it isn't the default any longer.  IF you change
-#   ---- this, you MUST rebuild your .css files with roughly equal
+#   ---- this, you MUST rebuild your .css files with decent
 #   ---- amounts of locally-grown spam and nonspam ( if you've been
 #   ---- following instructions and using the "reaver" cache, this is
 #   ---- easily done! )