#!/bin/bash
##
## searches $1 recursivly for maildirs and finds duplicate mails
##
## stderr: currently processed directory
## stdout: duplicate mails in currently processed directory
##
## Two mails will be considered as duplicates, if
## a) their "Message-Id"s are identical and
## b) their bodies (everything below the first empty line) are identical.
##
## Copyright 2008-2009 Christian Schneider <software(at)chschneider(dot)eu>
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License version 3 as
## published by the Free Software Foundation, not any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program.  If not, see <http://www.gnu.org/licenses/>.
##
## WARNING: THIS IS ALPHA SOFTWARE AND MAY CONTAIN SERIOUS BUGS!

VERSION="20090422"

#######################################################################
## functions
#######################################################################

exit_error () {
  echo "ERROR: ${1}!" >&2
  exit 1
}

print_help () {
  echo "Usage: ${0##*/} [-v|--verbose] [-p|--pairs] [-s|--stats] <dir>"
  echo
  echo "Searches a directory for maildirs and prints duplicates of mails"
  echo "in each subdirectory of a maildir to stdout."
  echo
  echo "Two mails will be considered as duplicates, if"
  echo "  a) their \`Message-Id's are identical and"
  echo "  b) their bodies (everything below the first empty line) are"
  echo "     identical."
  echo
  echo "Optional parameters:"
  echo "  -v|--verbose  print currently processed directory (to stderr)"
  echo "  -p|--pairs    print duplicates as pairs, not just one of them"
  echo "  -s|--stats    print statistics for each directory (number of"
  echo "                duplicates/total number of mails) (to stderr)"
}

mailid () {
  awk '
    {
      if (toupper($1) == "MESSAGE-ID:") {
        if (NF < 2) {
          getline
          print $1
        }
        else {
          print $2
        }
        exit
      }
    }
  ' "${1}"
}

mailbody () {
  awk '
    BEGIN {
      OUTPUT = 0
    }

    {
      if (OUTPUT == 0) {
        if (NF == 0) OUTPUT = 1
      } else {
        print
      }
    }
  ' "${1}"
}

mailhash () {
  mailbody "${1}" | md5sum | cut -d' ' -f1
}

mailfilename () {
  F="$(echo "${1##*/}" | awk -F':' '{ print $1; }')"
  echo "${1%/*}/${F}"
}

mailsufflen () {
  echo "${1##*/}" | awk -F':' '{ print length($2); }'
}

maildups_simple () {
  ## separator
  SEP="$(echo -en '\007')"

  ## check for empty directory
  for I in "${1}"/*; do
    if [[ -e "${I}" ]]; then
      break
    else
      return
    fi
  done

  ## get message ids
  for I in "${1}"/*; do
    echo "$(mailid "${I}")${SEP}${I}"
  done | sort > "${TMPFILE_A}"

  ## delete second tmpfile
  > "${TMPFILE_B}"

  ## new field separator
  IFS_SAVED="${IFS}"
  IFS="${SEP}"

  ## temporary variables
  ID_PREV="${SEP}"
  FN_PREV=""
  ID_LAST_WRITTEN="${SEP}"
  N_MSG="0"

  ## search for identical message ids only
  while read ID FN; do
    ## check for identical message ids
    if [[ "${ID}" == "${ID_PREV}" ]]; then
      ## check if new set of identical messages are processed;
      ## if so, write entry for previous mail
      if [[ "${ID_LAST_WRITTEN}" != "${ID}" ]]; then
        echo "${ID_PREV}${SEP}$(mailhash "${FN_PREV}")${SEP}${FN_PREV}" >> \
          "${TMPFILE_B}"
      fi

      ## write entry for current mail
      echo "${ID}${SEP}$(mailhash "${FN}")${SEP}${FN}" >> "${TMPFILE_B}"
      ID_LAST_WRITTEN="${ID}"
    fi

    ## previous mail
    ID_PREV="${ID}"
    FN_PREV="${FN}"

    ## count messages
    (( N_MSG++ ))
  done <"${TMPFILE_A}"

  ## first tmpfile not required any longer
  sort "${TMPFILE_B}" > "${TMPFILE_A}"

  ## temporary variables
  ID_PREV="${SEP}"
  BD_PREV="${SEP}"
  FN_PREV=""
  N_DBL="0"

  ## search for identical message ids *and* bodies
  while read ID BD FN; do
    ## check for identical message ids and body hashes
    if [[ "${ID}" == "${ID_PREV}" && "${BD}" == "${BD_PREV}" ]]; then
      ## check for identical bodies (to be absolutely sure ...)
      if diff -q <(mailbody "${FN}") <(mailbody "${FN_PREV}") &>/dev/null; then
        ## sorting will lead to the wrong order, if, e.g., a file mail first
        ## has been seen (suffix "S") and later replied (suffix "RS") =>
        ## fix this here ...
        if [[ "$(mailfilename "${FN_PREV}")" == "$(mailfilename "${FN}")" ]]; then
          if (( "$(mailsufflen "${FN_PREV}")" > "$(mailsufflen "${FN}")" )); then
            ## swap ...
            FN_TMP="${FN}"
            FN="${FN_PREV}"
            FN_PREV="${FN_TMP}"
          fi
        fi

        ## output of mail file names
        if [[ "${PAIRS}" == "true" ]]; then
          echo "${FN_PREV} ${FN}"
        else
          echo "${FN_PREV}"
        fi

        ## count duplicates
        (( N_DBL++ ))
      fi
    fi

    ## previous mail
    ID_PREV="${ID}"
    BD_PREV="${BD}"
    FN_PREV="${FN}"
  done <"${TMPFILE_A}"

  ## restore old field separator
  IFS="${IFS_SAVED}"

  ## output number of messages in maildir
  if [[ "${STATS}" == "true" ]]; then
    echo "${1}  ( ${N_DBL} / ${N_MSG} )" >&2
  fi
}

maildups_recurse () {
  while read SUBDIR; do
    if [[ -d "${SUBDIR}" ]]; then
      if [[ -d "${SUBDIR}/cur" && -d "${SUBDIR}/new" && -d "${SUBDIR}/tmp" ]]; then
        for MDIR in cur new tmp; do
          if [[ "${VERBOSE}" == "true" ]]; then
            echo "Processing \"${SUBDIR}/${MDIR}\" ..." >&2
          fi
          maildups_simple "${SUBDIR}/${MDIR}"
        done
      else
        maildups_recurse "${SUBDIR}"
      fi
    fi
  done < <(shopt -s dotglob; for I in "${1}"/*; do echo "${I}"; done)
}

#######################################################################
## command line
#######################################################################

VERBOSE="false"
PAIRS="false"
STATS="false"
SDIR=""

if [[ "${#}" == "0" ]]; then
  print_help
  exit 1
fi

while (( ${#} > 0 )); do
  case "${1}" in
    -h|--help)
      print_help
      exit 0
      ;;
    -v|--verbose)
      VERBOSE="true"
      ;;
    -p|--pairs)
      PAIRS="true"
      ;;
    -s|--stats)
      STATS="true"
      ;;
    *)
      if [[ -z "${SDIR}" ]]; then
        SDIR="${1}"
      else
        exit_error "Too many parameters: \`${1}'!"
      fi
      ;;
  esac
  shift
done

#######################################################################
## main part
#######################################################################

## temporary file (FIXME: trap with rm command ...)
TMPFILE_A="$(mktemp -t maildups_a.XXXXXX)" || \
  exit_error "Unable to create temporary file"
TMPFILE_B="$(mktemp -t maildups_b.XXXXXX)" || \
  exit_error "Unable to create temporary file"

## find duplicated
maildups_recurse "${SDIR}"

## clean up
rm -f "${TMPFILE_A}" "${TMPFILE_B}"
