#!/usr/bin/procmail

#TODO: rewrite to use SPAM variable, and do not autotrain spam here, only ham

# Directory holding the rc files included further down; a PMDIR that is
# already set and non-empty takes precedence over this default.
PMDIR=${PMDIR:-$HOME/.etc/mailfilter/procmail}

# Stand-alone filter mode: when PMVAR is unset or empty, load the shared
# defines and turn the final delivery into a pipe.
:0
* !PMVAR ?? .
{
  # PMVAR is not defined, so we are being called as filter
  # thus source the standard defines
  INCLUDERC=$PMDIR/defines
  # prevent feeding back to procmail and delete the leading From line
  PROCMAIL='/bin/cat'
  # make procmail keep behaving as a filter: the default destination becomes
  # a pipe into $PROCMAIL (single-quoted, so the reference is stored as-is)
  DEFAULT='|$PROCMAIL'
}

#VERBOSE=yes

# no need to reprocess messages that went into a spamtrap
# UPDATE: retrain them only if diagnosed as non-spam, see below
# Note: add E flag to next recipe when uncommenting
#:0
#* SPAMTRAPPED ?? .
#{
#  LOG="spamfilter:  skipping checks for spamtrapped message$NL"
#  :0 fw
#  |$FORMAIL -I"X-Spam: spamtrapped"
#}

# check whether this message is being resubmitted
# ($MSG_DEJAVU is expanded first and the result is used as the condition;
# it is not defined in this file)
:0
*$ $MSG_DEJAVU
{
  # already trained earlier: stamp the recorded verdict into X-Spam
  :0
  * TRAINED_AS ?? .
  {
    LOG="spamfilter:  skipping already trained $TRAINED_AS$NL"
    :0 fw
    |$FORMAIL -I"X-Spam: $TRAINED_AS (already trained)"
    # NOTE(review): unanchored match - any TRAINED_AS value that merely
    # contains "spam" sets IS_SPAM; confirm the possible values
    :0
    * TRAINED_AS ?? spam
    { IS_SPAM=already-trained }
  }

  # resubmitted without a recorded training result: verdict stays unknown
  :0 E
  {
    LOG="spamfilter:  skipping resubmitted message$NL"
    :0 fw
    |$FORMAIL -I"X-Spam: unknown (resubmitted)"
  }
}

# do not run spamfilters if the message destination is already set
# (E flag: only considered when the recipe before it did not match;
# DEST is set outside this file)
:0 E
* DEST ?? .
{
  LOG="spamfilter:  message already routed to '$DEST'$NL"
  :0 fw
  |$FORMAIL -I"X-Spam: unknown (already routed)"
  # record why no verdict was produced
  SPAM_UNKNOWN=already-destined
}

# let earlier parts of the mailfilter cause bypassing the checks
# (any non-empty SKIP_SPAMCHECKS triggers this; its value is only logged)
:0 E
* SKIP_SPAMCHECKS ?? .
{
  LOG="spamfilter:  skipping checks as requested: $SKIP_SPAMCHECKS$NL"
  :0 fw
  |$FORMAIL -I"X-Spam: unknown (skip requested)"
  # record why no verdict was produced
  SPAM_UNKNOWN=skip-requested
}

# honour skip-spamchecks to exclude certain messages from spam checks
# altogether
# (flags B and H feed both header and body to the command; egrep reads its
# patterns from $CONF/skip-spamchecks, case-insensitively and quietly, and
# its exit status decides the condition)
:0 EBH
* ? $EGREP -qif $CONF/skip-spamchecks
{
  LOG="spamfilter:  skipping checks as per skip-spamchecks$NL"
  :0 fw
  |$FORMAIL -I"X-Spam: unknown (check skipped)"
  SPAM_UNKNOWN=skip-match
  # also raise the generic skip flag, which the spamtrapped-ham recipe at
  # the end of this file tests
  SKIP_SPAMCHECKS=match
}

# sanity check on message size
# NOTE(review): this size condition carries no "$" modifier, unlike the
# MSG_DEJAVU condition earlier in this file - confirm that procmail expands
# $SPAMCHECK_MAX_MESSAGE_SIZE here
:0 E
* > $SPAMCHECK_MAX_MESSAGE_SIZE
{
  LOG="spamfilter:  skipping check because message size exceeds $SPAMCHECK_MAX_MESSAGE_SIZE bytes$NL"
  :0 fw
  |$FORMAIL -I"X-Spam: unknown (message larger than $SPAMCHECK_MAX_MESSAGE_SIZE bytes)"
  # record why no verdict was produced
  SPAM_UNKNOWN=too-large
}

# now run the spamfilters
:0 E
{
  # pull in the helper rc files, in this order, before the classifiers run
  # (their contents are not visible from this file; variables tested below,
  # such as SPAMTRAPPED, SKIP_CRM and SKIP_SA, are presumably set there or
  # by the caller - TODO confirm)
  INCLUDERC=$PMDIR/spamtraps
  INCLUDERC=$PMDIR/spammers
  INCLUDERC=$PMDIR/spampat
  INCLUDERC=$PMDIR/pre-spam-cleanup

  # crm114
  # Defaults that stay in place when crm114 is skipped or its status header
  # cannot be parsed.
  CRM_SPAM=UNKNOWN
  CRM_SCORE=0
  # run crm114 unless SKIP_CRM is set to a non-empty value
  :0
  * !SKIP_CRM ?? .
  {
    #TODO: somehow filter out headers we added
    # pipe the message through crm114 as a filter and wait for it to finish
    :0 fw
    |$CRM114

    # verdict: the run of letters right after "X-CRM114-Status: "
    # (the text matched after \/ ends up in MATCH)
    :0
    * ^X-CRM114-Status: \/[A-Z]+
    { CRM_SPAM=$MATCH }

    # score: the number following the opening parenthesis on that header
    :0
    * ^X-CRM114-Status: .+\([ ]*\/-?[.0-9]+
    { CRM_SCORE=$MATCH }

    LOG="crm114:      $CRM_SPAM/$CRM_SCORE$NL"
  }

  # spamassassin
  # Reset every result variable so that nothing stale survives when SA is
  # skipped or spamc adds no X-Spam-Status header.
  # SA_STATUS is not read anywhere in this file; the assignment is kept only
  # in case another rc file relies on it.
  SA_STATUS=Unknown
  # SA_SPAM is the variable the conditions further down actually test.  It is
  # removed from the environment (a bare name does that) rather than set to
  # "Unknown": the unanchored, case-insensitive test "SA_SPAM ?? No" would
  # match "Unknown" and wrongly treat a skipped SA run as a ham verdict.
  SA_SPAM
  SA_SCORE=0
  SA_TESTS=none
  # run SpamAssassin unless SKIP_SA is set to a non-empty value
  :0
  * !SKIP_SA ?? .
  {
    # pipe the message through spamc as a filter and wait for it to finish
    :0 fw
    |$SPAMC

    # verdict: the first word of X-Spam-Status (Yes/No)
    :0
    * ^X-Spam-Status: \/[A-Za-z]+
    { SA_SPAM=$MATCH }

    # numeric score following "score="
    :0
    * ^X-Spam-Status: .+score=\/-?[.0-9]+
    { SA_SCORE=$MATCH }

    # list of triggered tests following "tests=", up to the next blank
    :0
    * ^X-Spam-Status: .+tests=\/[^ ]+
    { SA_TESTS=$MATCH }

    LOG="SA:          $SA_SPAM/$SA_SCORE/$SA_TESTS$NL"
  }

  ## CASE 0: crm114 is unsure/untrained
  # Decide from the spamtrap flag or from SA's score whether, and as what,
  # crm114 gets trained.  The branches form an if / else-if chain via the E
  # flag, so only one of them runs.
  # NOTE(review): if SA was skipped, SA_SCORE still holds its initial value 0
  # here - confirm that autotraining on that default is intended.
  :0
  * CRM_SPAM ?? UNSURE
  {
    # retrain spamtrapped message
    :0
    * SPAMTRAPPED ?? .
    {
      LOG="spamfilter:  scheduling retraining with SPAM due to spamtrap$NL"
      :0 fw
      |$FORMAIL -A "X-CRM114-Autotrain: spam, due to spamtrap"
      RETRAIN=spam
    }

    # retrain as ham
    # (perl performs the numeric comparison; a non-zero exit status makes the
    # condition fail)
    :0 E
    * ? perl -e "$SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM || exit 1"
    {
      LOG="spamfilter:  scheduling retraining with HAM (score $SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM)$NL"
      :0 fw
      |$FORMAIL -A "X-CRM114-Autotrain: ham, according to SA (score $SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM)"
      RETRAIN=ham
    }

    # retrain as spam
    # NOTE(review): the "1^0" weight makes this a scored condition, unlike
    # the plain "?" test above; presumably equivalent here - confirm
    :0 E
    * 1^0 ? perl -e "$SA_SCORE > $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM || exit 1"
    {
      LOG="spamfilter:  scheduling retraining with SPAM (score $SA_SCORE > $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM)$NL"
      :0 fw
      |$FORMAIL -A "X-CRM114-Autotrain: spam, according to SA (score $SA_SCORE > $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM)"
      RETRAIN=spam
    }

    # skip retraining if SA is not convinced
    # This branch is reached only when the score is neither <= the ham limit
    # nor > the spam limit, so the reported range is "ham limit < score <=
    # spam limit".
    :0 E
    {
      LOG="spamfilter:  will not autotrain crm114 because SA is not convinced ($CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM < $SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM)$NL"
      :0 fw
      |$FORMAIL -A "X-CRM114-Autotrain: SA is unsure ($CRM_UNSURE_SA_AUTOTRAIN_LIMIT_HAM < $SA_SCORE <= $CRM_UNSURE_SA_AUTOTRAIN_LIMIT_SPAM)"
      SPAM_UNSURE=sa-unsure
    }
  }

  ## CASE 1: disagreement, SA sees ham
  # crm114 reported SPAM while SA reported No.  The inner recipes form an
  # if / else-if / else chain through the E flag.
  :0 E
  * CRM_SPAM ?? SPAM
  * SA_SPAM ?? No
  {
    # message was spamtrapped anyway
    :0
    * SPAMTRAPPED ?? .
    {
      LOG="spamfilter:  resolving crm114/SA disagreement due to spamtrap ($CRM_SCORE/$SA_SCORE)$NL"
      RETRAIN=spam
      :0 fw
      |$FORMAIL -A "X-CRM114-Retrain: spam, due to spamtrap"
    }

    # SA is convincing, so retrain crm114
    # (perl performs the numeric comparison; a non-zero exit status makes the
    # condition fail)
    :0 E
    * ? perl -e "$SA_SCORE <= $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_HAM || exit 1"
    {
      LOG="spamfilter:  crm114 found spam ($CRM_SCORE), but SA is more convincing ($SA_SCORE <= $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_HAM)$NL"
      RETRAIN=ham
      :0 fw
      |$FORMAIL -A "X-CRM114-Retrain: ham, according to SA (score $SA_SCORE <= $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_HAM)"
    }

    # SA is not convincing, mark as disagreement
    :0 E
    {
      LOG="spamfilter:  crm114 found spam ($CRM_SCORE), but SA thinks it's ham ($SA_SCORE)$NL"
      SPAM_DISAGREE=sa-ham
      :0 fw
      |$FORMAIL -I "X-Spam: disagree (crm114:spam/$CRM_SCORE SA:ham/$SA_SCORE)"
    }
  }

  ## CASE 2: disagreement, SA sees spam
  # crm114 reported GOOD while SA reported Yes.  The inner recipes form an
  # if / else-if / else chain through the E flag, exactly like the
  # "SA sees ham" case.
  :0 E
  * CRM_SPAM ?? GOOD
  * SA_SPAM ?? Yes
  {
    # message was spamtrapped anyway
    :0
    * SPAMTRAPPED ?? .
    {
      LOG="spamfilter:  resolving crm114/SA disagreement due to spamtrap ($CRM_SCORE/$SA_SCORE)$NL"
      RETRAIN=spam
      :0 fw
      |$FORMAIL -A "X-CRM114-Retrain: spam, due to spamtrap"
    }

    # SA is convincing, so retrain crm114
    # The E flag keeps this branch (and the fallback after it) from running
    # once the spamtrap branch has already resolved the disagreement; without
    # it a spamtrapped message could receive a second X-CRM114-Retrain header
    # or still be marked as "disagree".
    :0 E
    * ? perl -e "$SA_SCORE > $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_SPAM || exit 1"
    {
      LOG="spamfilter:  crm114 found ham ($CRM_SCORE), but SA is more convincing ($SA_SCORE > $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_SPAM)$NL"
      RETRAIN=spam
      :0 fw
      |$FORMAIL -A "X-CRM114-Retrain: spam, according to SA (score $SA_SCORE > $CRM_MISCLASSIFY_SA_AUTOTRAIN_LIMIT_SPAM)"
    }

    # SA is not convincing, mark as disagreement
    :0 E
    {
      LOG="spamfilter:  crm114 found ham ($CRM_SCORE), but SA thinks it's spam ($SA_SCORE)$NL"
      SPAM_DISAGREE=sa-spam
      :0 fw
      |$FORMAIL -I "X-Spam: disagree (crm114:ham/$CRM_SCORE SA:spam/$SA_SCORE)"
    }
  }

  # agreement: crm114 reported SPAM and SA reported Yes
  :0 E
  * CRM_SPAM ?? SPAM
  * SA_SPAM ?? Yes
  {
    IS_SPAM=sa+crm
    :0 fw
    |$FORMAIL -I"X-Spam: yes (crm114:$CRM_SCORE SA:$SA_SCORE)"
  }

  # fallback at the end of the E chain: every combination not handled by one
  # of the cases above is stamped "no"
  :0 Efw
  |$FORMAIL -I"X-Spam: no (crm114:$CRM_SCORE SA:$SA_SCORE)"
}

# schedule spamtrapped ham for retraining as spam
# Applies to messages that hit a spamtrap, were not excluded from the checks
# and were not flagged via IS_SPAM by anything above.  No E flag: this recipe
# is evaluated independently of the chain before it.
:0
* SPAMTRAPPED ?? .
* ! SKIP_SPAMCHECKS ?? .
* ! IS_SPAM ?? .
{
  LOG="spamfilter:  found spamtrapped ham, retraining...$NL"
  :0 fw
  |$FORMAIL -I"X-Spam: spamtrapped ham"
  IS_SPAM=spamtrapped-ham
  RETRAIN=spam
  # a bare variable name removes the variable: drop any "unsure" marker set
  # by the crm114-unsure branch
  SPAM_UNSURE
}

#VERBOSE=no