crm114/mailfilter.cf

   1 #  mailfilter.cf  -- Config file for mailfilter, mailreaver, mailtrainer
   2 #
   3 #    You MUST edit the fields for "Secret Password", "mime decoder", and
   4 #    "cache_dupe_command".  Just those THREE things.
   5 #
   6 #     Changes to all other values are optional.
   7 #
   8 #    Many of the options here have two or three alternatives; for your
   9 #     convenience, we have put all of the reasonable alternatives
  10 #      on sequential lines.  Uncomment the one you want, and leave the
  11 #       others commented out.  If you leave more than one uncommented, the
  12 #       last one is the one that's used.  Don't do that; it's ugly.
  13 #
  14 #   After you edit this file, don't forget to edit 'rewrites.mfp'
  15
  16 #     --------->>>  You MUST set the following correctly! <<<-------
  17 #
  18 #    If you leave it as "DEFAULT_PASSWORD", you will not be able to
  19 #    access the mail-to-myself commanding system, as "DEFAULT_PASSWORD"
  20 #    is specifically _disabled_ as a legal password.  Just pick something, eh?
  21 #
  22 :spw: /DEFAULT_PASSWORD/
  23
  24 # ----- If you want a verbose startup, turn this on.  Note that this is
  25 #  ----- intentionally _after_ the password is set, so a verbose startup
  26 #   ----- will not reveal your password.
  27 #
  28 #:verbose_startup: /SET/
  29 :verbose_startup: //
  30
  31 #
  32 #     --------->>>  You MUST set the following correctly! <<<-------
  33 #
  34 #     --- Some mail systems do mime decoding with "mimencode -d" or "-u".
  35 #     --- Others (such as Red Hat 8.0) use "mewdecode" .
  36 #     --- Yet others (such as Fedora Core 3) use "openssl base64 -d" .
  37 #     --- Yet Others (i.e. *BSDs) can use "base64" .
  38 #     --- See which one is on your system and use that one- comment
  39 #     --- the others out.  If you can't figure out what your base64 mime
  40 #     --- decoder is, or don't want mime decoding, set :do_base64: to /no/
  41 #     --- but expect a significant accuracy decrease if you do this.
  42 #
  43 #:do_base64: /no/
  44 :do_base64: /yes/
  45 #
  46 #:mime_decoder: /mewdecode/
  47 #:mime_decoder: /mimencode -d/
  48 #:mime_decoder: /mimencode -u/
  49 #:mime_decoder: /base64 -d/
  50 :mime_decoder: /openssl base64 -d/
  51 #:mime_decoder: /normalizemime/
  52
  53
  54 #     --------->>>  You MUST set the following correctly! <<<-------
  55 #
  56 #    --- Linux (and Unix) systems use "hardlinks" to make a file
  57 #    --- appear in more than one place, while not actually using up
  58 #    --- extra disk space.  Sadly, it is the case that most
  59 #    --- Windows systems have no such feature.  So, you must set the
  60 #    --- following for what kind of system you are actually using.
  61 #    --  Note to other developers: here's where to put other system-dependent
  62 #    --  syscall commands.
  63 #
  64 #    --- Use the default /ln/ for LINUX and UNIX systems (does a hard-link,
  65 #    --- does not use up disk space or inodes).  Change this to the /copy/
  66 #    --- command for WINDOWS systems (95, 98, NT, XP)
  67 #
  68 #    --- Mild security issue: to avoid a theoretical exploit where a user
  69 #    --- gets their commands re-aliased, make sure you use the fully qualified
  70 #    --- commandname (that is, starting in the root directory).
  71 #
  72 :cache_dupe_command: /\/bin\/ln/
  73 #:cache_dupe_command: /copy/
  74
  75
  76
  77 ###########################################################################
  78 #
  79 #                END of things you absolutely MUST set.  Feel free
  80 #            to keep reading though...
  81 #
  82 ###########################################################################
  83
  84 ###########################################################################
  85 #
  86 #             START of things you might likely want to set.  These
  87 #            are probably OK for you, but many users change these things.
  88 #
  89 ##########################################################################
  90
  91 #  ----------- define an optional target for where to send spam (that is,
  92 #   ------------ emails that we want to "fail", or reject to another
  93 #    ------------ address).  Note that this is NOT a "program fault" address,
  94 #     ------------ but where to send "bad" email to in the general case.
  95 #      ------------ You can specify tightly controlled conditions too,
  96 #  ----------- define an optional target for where to send spam,
  97 #   ----------- To NOT forward this to another account, just leave the
  98 #    ----------- address as the empty string, which is '//'.
  99 #     ----------- This works fine especially if your mail reader program
 100 #      ----------- can sort based on the ADV and UNS (or whatever you choose
 101 #       ----------- to use as flagging strings) in the "Subject:" field.
 102 #     ------- CAUTION- some systems are buggy and _REQUIRE_ a user@host.domain
 103 #    ----- in the following to forward spammy mail correctly.  WTF??? :-(
 104 #
 105 #:general_fails_to: /somebody@somewhere.net/
 106 :general_fails_to: //
 107
 108
 109 #   -------- If you would prefer to send specific kinds of spam to
 110 #    -------- different mailboxes, here's where to do it.
 111 #     -------- (be sure to uncomment the line!).  Again, these are
 112 #      --------- not "program fault" conditions, just different filter results.
 113 #
 114 # :fail_priority_mail_to:  /where_priority_fails_go/
 115 # :fail_blacklist_mail_to:  /where_blacklist_fails_go/
 116 # :fail_SSM_mail_to:  /where_Classifier_fails_go_for_mailFILTER/
 117 # :fail_classify_mail_to: /where_classifier_fails_go_for_mailREAVER/
 118
 119
 120 #  ---------  Do we give nonspam, spam, and unsure an exitcode of 0
 121 #  ---------    (for most standalone apps) or something else?
 122 #  ---------     Usually we use an exit code of 1 for "program fault",
 123 #  ---------      but change it if you need to use 0/1 for good/spam
 124 #  ---------       Don't use an exit code greater than 128 (it breaks BASH!)
 125 #  ---------     If you use exit codes (procmail doesn't) change it here.
 126 :rejected_mail_exit_code: /0/
 127 :accepted_mail_exit_code: /0/
 128 :unsure_mail_exit_code: /0/
 129 :program_fault_exit_code: /1/
 130
 131 #######################################################################
 132 #
 133 #         END of things you are likely to want to change.
 134 #
 135 #         Anything following is starting to approach true customization.
 136 #        Feel free to explore and poke around.
 137 ######################################################################
 138
 139 # -----------Do we want to add the optional headers to the mail?
 140 # -----------If turned on, will add X-CRM114-Whatever: headers on each
 141 # -----------incoming email.  (note- this does NOT turn off the cache-id header)
 142 #
 143 :add_headers: /yes/
 144 #:add_headers: /no/
 145
 146
 147 # ---------  do we add the statistics report?
 148 :add_verbose_stats: /no/
 149 #:add_verbose_stats: /yes/
 150
 151
 152 # ---------  do we add the mailtrainer report to the top of the message body
 153 # ---------  after training?
 154 :add_mailtrainer_report: /no/
 155 #:add_mailtrainer_report: /yes/
 156
 157
 158 #  ---------  Do we enable long-form explains (with lots of text)?
 159 #  -- you can have no extra stuff, add it as text, or add it as an attachment.
 160 #  -- (only available in mailfilter, not mailreaver)
 161 #
 162 :add_extra_stuff: /no/
 163 # :add_extra_stuff: /text/
 164 # :add_extra_stuff: /attachment/
 165
 166
 167 #  ---------  Do we prevent crm114 from adding multiple sfid tags?
 168 #  -- every time mailreaver or mailfilter process a message, they add an sfid
 169 #  -- tag to the Message-Id field, even if such a tag already exists.
 170 #  -- by setting the following, crm114 knows not to add the tag if one already
 171 #  -- exists.
 172 #
 173 :unique_sfid: /SET/
 174
 175
 176 #  ---------  Do we want to insert a "flagging" string on the subject line,
 177 #  ---------  perhaps to insert an 'ADV:'  ?  Whatever string we put here
 178 #  ---------  will be inserted at the front of the subject if we think the
 179 #  ---------  mail is spam.
 180 #
 181 # :spam_flag_subject_string: /ADV:/
 182 :spam_flag_subject_string: //
 183
 184 #  ---------  Do we want to insert a "flagging" string on the subject line
 185 #  ---------  for good email?  Usually we don't.... so we set this to the
 186 #  ---------  null string - that is, //
 187 :good_flag_subject_string: //
 188
 189 #  ------------Similarly, do we want to insert a "flagging" string on
 190 #  -------------the subject line of an "unsure" email?  This way we know
 191 #  --------------we need to train it even if "headers" is turned off.
 192 # :unsure_flag_subject_string: /UNS:/
 193 :unsure_flag_subject_string: //
 194
 195 # ------------- Do we want Training ConFirmation flags on the results of
 196 # ------------- a message to be learned?  Default is "TCF:".
 197 :confirm_flag_subject_string: //
 198 #:confirm_flag_subject_string: /TCF:/
 199
 200
 201 # ---------  Do we want to do any "rewrites" to increase generality and
 202 #  ---------- (usually) accuracy?  IF 'yes', be sure to edit rewrites.mfp!
 203 #    --------- NOTE: this option is somewhat slow.  If your mailserver is
 204 #      --------- maxed out on CPU, you might want to turn this off.
 205 #
 206 :rewrites_enabled: /yes/
 207 #:rewrites_enabled: /no/
 208
 209
 210 #  ---------  Do we copy incoming text into allmail.txt ?  default is yes, but
 211 #   ---------  experienced users will probably set this to 'no' after testing
 212 #    ---------  their configuration for functionality.
 213 #
 214 :log_to_allmail.txt:  /no/
 215 # :log_to_allmail.txt: /yes/
 216
 217 #   -------  Another logging option - log all mail to somewhere else
 218 #    -------  entirely.  Whatever pathname is given here will be prefixed
 219 #     -------  by :fileprefix:
 220 #      -------  To not use this, set it to the null string .. //
 221 #       -------  Remember to backslash-escape path slashes!
 222 :log_all_mail_to_file: //
 223 #:log_all_mail_to_file: /my_personal_mail_log_file_name.txt/
 224
 225 #     --------- When we log messages, should we use a mail separator?
 226 #      --------- And, if so, what?
 227 :mail_separator: /\n-=-=-=-=-=-=- cut here -=-=-=-=-=-=-\n/
 228
 229 #
 230 #     ---------- Message Cacheing for retraining - do we keep a cache of
 231 #    ---------- messages we've classified recently "in the wild" as retrain
 232 #   ---------- texts?  This uses up some disk space, but means that we can
 233 #  ---------- use mailtrainer.crm on these messages to autotune the classifier.
 234 # ---------- Default is to cache into the directory reaver_cache ;
 235 #  ---------- if you don't want this, set it to // .  If you don't use this,
 236 #   ---------- you can't really use mailtrainer.crm, and you must keep your
 237 #    ---------- headers scrupulously clean in all train messages.  Recommended
 238 #     ---------- to leave this unchanged unless you are VERY short of disk.
 239 #
 240 :text_cache: /../../../.var/crm114/reaver_cache/
 241 # :text_cache: //
 242
 243
 244 #   ----- How do we invoke the trainer (as in, just the invocation
 245 #   ------ of CRM114 on mailtrainer.crm.  Usually this is just obvious,
 246 #   ------- but if you don't have CRM114 installed in the search path, here's
 247 #   -------- where you can set trainer invocation to be via whatever path
 248 #   --------- you want it (the second example is if you haven't installed
 249 #   ---------- CRM114 at all, but are running the crm114_tre static binary
 250 #   ----------- right out of the local directory.)
 251 #
 252 #     -- use this next one if you have installed CRM114 with "make install"
 253 #     -- (This is preferred and is the default)
 254 :trainer_invoke_command: /\/usr\/share\/crm114\/mailtrainer.crm/
 255 #
 256 #     -- use this one if you can't do a "make install" and so must run the
 257 #     --- crm114_tre binary directly out of the current working directory.
 258 # :trainer_invoke_command: /.\/crm114_tre mailtrainer.crm /
 259
 260
 261 #    ------  If we're cacheing for retraining, we're probably using
 262 #     ------  mailtrainer.crm or some other variant.  In that case,
 263 #      ------  you will want a "randomizer" to present the training
 264 #       ------  examples to the classifier in some random but balanced order.
 265 #        ------  You have two choices - you can either use the "sort"
 266 #         ------  command on some random character in the filename (this
 267 #          ------  is NOT recommended) or use the "shuffle.crm" program.
 268 #           ------  We _strongly_ recommend using shuffle.crm; the default
 269 #            ------  options to shuffle.crm will work fine.  Alternatively,
 270 #             ------  you can use the "sort --key 1.2" on date-named files to
 271 #              -----   achieve chronological training
 272 :trainer_randomizer_command: /.\/shuffle.crm/
 273 #:trainer_randomizer_command: /.\/crm114 shuffle.crm/
 274 #:trainer_randomizer_command: /sort --key 1.2/
 275
 276
 277 #  ---------  Do we log rejected mail to a file?  default yes, but most
 278 #   ---------  users should set this to no after testing their
 279 #    ---------  configuration to verify that rejected mail goes to the
 280 #     ---------  reject address.  Only works in mailfilter.crm
 281 #
 282 :log_rejections: /yes/
 283 #:log_rejections: /no/
 284
 285 #  ------- alternate rejection logging - set this pathname to non-null
 286 #   ------  to log rejections elsewhere.  Only for mailreaver.crm.
 287 #    -----   Set to NULL ( // ) to turn this off.
 288 :log_rejections_to_file: //
 289 #:log_rejections_to_file: /this_is_my_rejected_email_log_file.txt/
 290
 291
 292 #   ----------Do we want to enable "inoculation by email"?
 293 #   --------(leave this off unless you want RFC inoculations)
 294 #
 295 :inoculations_enabled: /no/
 296 #:inoculations_enabled: /yes/
 297
 298
 299 #  --------- How many characters of the input do we really trust to be
 300 #  ---------- worthy of classification?  Usually the first few thousand
 301 #  ----------- bytes of the message tell more than enough (though we've
 302 #  ------------ been "noticed" by spammers, who are now packing 4K of
 303 #  ------------- innocuous text into their headers.  No problemo... :) )
 304 #
 305 #:decision_length: /4096/
 306 #:decision_length: /64000/
 307 :decision_length: /16000/
 308 #  -----  for entropy users ONLY - 4K is plenty!
 309 #:decision_length: /4096/
 310
 311
 312
 313 #  ------------ Do we want to expand URLs (that is, fetching the contents
 314 #  ------------- of a URL and inserting that after the url itself?)
 315 #  -------------- By default this is off, but turn it on if you want
 316 #  --------------- to experiment.
 317 :expand_urls: /no/
 318 # :expand_urls: /yes/
 319 #
 320 #         WGET options - 30-second timeout, output to stdout.
 321 #         HACK - use the proper --user-agent="IEblahblah" for max effect!
 322 :url_fetch_cmd:  /wget -T 30 -O -  /
 323 #         and trim the URL text to not more than 16000 bytes of text.
 324 :url_trim_cmd:  / head -c 16000 /
 325
 326
 327 #######################################################################
 328 #
 329 #   -------------------  YOU REALLY SHOULD STOP HERE -------------------
 330 #   ---------  values below this line are usually OK for almost all
 331 #   ---------  users to use unchanged - Gurus only beyond this point.
 332 #
 333 #######################################################################
 334 #
 335 #   If you want to change things here, go ahead, but realize you are
 336 #   playing with things that can really hurt accuracy.
 337 #
 338 #   This being open source, if you don't *think* about changing it,
 339 #   what would be the use of it being open source?  That said, this
 340 #   _is_ open source- you break it, you get to keep _both_ pieces!
 341 #
 342 #
 343 #   ------------ CLF - The Classifier Flags ----------
 344 #
 345 #   ---------  Which classifier flags do we use?  Default for 20060101 has
 346 #   ---------  been changed to OSB UNIQUE MICROGROOM.
 347 #
 348 #   ---------  A null setting gets you straight Markovian, without
 349 #   ---------  microgrooming.   OSB uses less memory, is faster,
 350 #   ---------  and is usually more accurate.  Correlative matching is
 351 #   ---------  100x - 1000x slower, but can match anything (binaries,
 352 #   ---------  wide chars, unicode, virii, _anything_).  Winnow is a
 353 #   ---------  non-statistical learning classifier with very nice
 354 #   ---------  accuracy (up to 2x SBPH).  Hyperspace is a pseudogaussian
 355 #   ---------  KNN (K-nearest-neighbor) matcher.
 356 #
 357 #   ---------  This is also where we set whether to use microgrooming
 358 #   ---------  or Arne optimization (they're currently mutually exclusive).
 359 #   ---------  If you turn off microgrooming you get Arne optimization
 360 #   ---------  automatically.
 361 #
 362 #   ---------  If you _change_ this, you _must_ empty out your .css or
 363 #   ---------  .cow files and build fresh ones, because these
 364 #   ---------  classifiers do NOT use compatible data storage formats!
 365 #
 366 #:clf: /microgroom/
 367 #:clf: /osb/
 368 #:clf: /osb microgroom/
 369 :clf: /osb unique microgroom/
 370 #:clf: /correlate/
 371 #:clf: /winnow/
 372 #:clf: /osbf/
 373 #:clf: /osbf microgroom/
 374 #:clf: /hyperspace/
 375 #:clf: /hyperspace unique/
 376 #
 377 #
 378 #
 379 #     --------Thresholds for GOOD/UNSURE/SPAM thick-threshold training
 380 #     -------
 381 #     ------ A very small thick threshold (or zero!) works for Markovian.
 382 #     ----- A thick threshold of 5 to 20 seems good for OSB, OSBF,
 383 #     ---- Hyperspace, Bit-Entropy, and Winnow.  If you want an asymmetric
 384 #     --- threshold system, you can do that by having :good_threshold:
 385 #     -- be different from :spam_threshold:.  The defaults are +/- 10.0
 386 #
 387 #
 388 #   ---- Things rated equal to or better than this are GOOD email
 389 #:good_threshold: /0.01/
 390 #:good_threshold: /5.0/
 391 :good_threshold: /10.0/
 392 #:good_threshold: /20.0/
 393 #
 394 #   ---- Things rated less than or equal to this are SPAM
 395 #:spam_threshold: /-0.01/
 396 #:spam_threshold: /-5.0/
 397 :spam_threshold: /-10.0/
 398 #:spam_threshold: /-20.0/
 399
 400 #   ---- mailfilter uses a single threshold and operates symmetrically.
 401 #   --- (this is only to provide backward compatibility)
 402 :thick_threshold: /10.0/
 403
 404 #   ---- What regex do we use for LEARN/CLASSIFY?  the first is the
 405 #   ---- "old standard".  Other ones are handy for different spam
 406 #   ---- mixes.  The last one is for people who get a great deal of
 407 #   ---- packed HTML spam, which is almost everybody in 2003, so it
 408 #   ---- used to be the default.  But since spammers have shifted away
 409 #   ---- from this, it isn't the default any longer.  IF you change
 410 #   ---- this, you MUST rebuild your .css files with decent
 411 #   ---- amounts of locally-grown spam and nonspam ( if you've been
 412 #   ---- following instructions and using the "reaver" cache, this is
 413 #   ---- easily done! )
 414 #
 415 :lcr: /[[:graph:]]+/
 416 #:lcr: /[[:alnum:]]+/
 417 #:lcr: /[-.,:[:alnum:]]+/
 418 #:lcr: /[[:graph:]][-[:alnum:]]*[[:graph:]]?/
 419 #:lcr: /[[:graph:]][-.,:[:alnum:]]*[[:graph:]]?/
 420 #
 421 #  this next one is pretty incomprehensible, and probably wrong...
 422 #:lcr: /[[:print:]][/!?\#]?[-[[:alnum:]][[:punct:]]]*(?:[*'=;]|/?>|:/*)?
 423 #
 424 #
 425 #     Expansions for antispamming.  You almost _always_ want these on,
 426 #     unless you're debugging something really bizarre.
 427
 428 #  ---------  Do we enable spammus interruptus undo?
 429 :undo_interruptus: /no/
 430 #:undo_interruptus: /yes/
 431 #
 432 #
 433 #
 434 # ------------ HIGHLY EXPERIMENTAL - automatic training!
 435 #             enable this only if you really want to live VERY dangerously!
 436 #              "Do you feel lucky today, punk?  Well, do ya?"
 437 #
 438 :automatic_training: /no/
 439 #
 440 #       ---- if you are living dangerously and have turned on autotraining,
 441 #            you should also set the following to point to an address that
 442 #            will get read on a quick basis, because this is where autotrain
 443 #            verifications will go.
 444 #
 445 #:autotrain_address: /root/
 446
 447
 448 :datadir: /..\/..\/.var\/.crm114/