#!/usr/bin/perl -w ## anonymize filter: remove email addresses and last names ## with --test argument, run self-tests and ignore standard input ## with --quiet argument, send warnings only to log file, not to standard out # Michael Ernst # This filter removes email addresses and last names from its input # (typically a mail archive); full names and email addresses are replaced # by unique nicknames. For some more documentation, see # http://www.cf-web.org/cystic-l/anonymization.html. # Typical usage is to run this program twice. After the first run, the # user edits the .anon-alias file based on the diagnostic output in # anonymize.log; if any changes were made, then the program should be run a # second time to produce the desired result. Only one instance of this # script should be run at a time, because it writes a number of files when # it is finished. # This script operates differently on From: lines and all other lines: # * From: lines: # * if not recognized, add this address to all three arrays # * replace by semi-anonymized version # * All other lines: # * replace discovered fromlines, addresses, full names # * remove junk text # Customization files: # * .anon-alias consists of two-line entries. The first line of each # entry is a deprecated nickname; the second line is the preferred # nickname. After the program is run just once, the deprecated nickname # will no longer appear in any internal databases and the entry in # .anon-alias may be eliminated. Performing this alias translation is a # simple way to either change an alias or to merge two different aliases # that have been assigned to the same individual; in the latter case, it # is shorthand for specifying that each of the respective From: lines, # addresses, and names correspond to one another. That information about # which addresses (say) correspond to one another isn't explictly given. # It can be read out of the .anon-all file, however. When changing an # alias, take care that the new nickname isn't already in use; check file # .anon-all. (You can safely use the suggested nickname (in the "reusing" # message) if you haven't rerun this program (on any input) in the # meanwhile.) # * .anon-junk-texts consists of chunks of text separated by a line of 75 # equal signs. (Use cut and paste to get another such separator; for # best results, there should also be a separator at the end of the file.) # Occurrences of these texts are removed, even when quoted by another # message. This is a good way to eliminate annoyance text like # advertisements inserted by Juno and other free mail services. # * .anon-extra [NOT YET SUPPORTED] is a list of arbitrary texts to be # replaced by arbitrary other texts. It seems this might be useful. # For the time being, you can just put the arbitrary text, and its # replacement, in the .anon-name file. # * .anon-{fromline,address,name} maintain correspondences between # cleartext and anonymized versions of From: lines, addresses, and names. # Their format is two-line entries (first line is cleartext, second is # nickname). Generally, users will not edit these files. One reason to # edit them by hand would be to split apart multiple names/addresses/etc. # associated with the same nickname. There isn't an automated, easy way # to perform this uncommon operation; rather, one must go to each # occurrence of the nickname in any of these files and change it to the # new nickname or not, as appropriate. # These files are read by the program at startup, and updated versions # are written before the program terminates. If these files were # somehow lost, then it would probably be necessary to completely # re-anonymize the entire archive, lest when a name is encountered a # different nickname is chosen for it than was used on a preveious # anonymization run that used the information that had accumulated in # these files. Also, the association of various apparently-different # addresses/names with one another would be lost. # * .anon-all maps from nicknames to information about them. This file is # not read by the program; it is only produced for human consumption. # For each nickname, four lines of information are printed: # * list of From: lines # * list of addresses # * list of names # * list of other nicknames that map to this one # The different addresses that map to the same nickname can be read off # this list, and likewise for From: lines and names. For instance, just # search in this file for the address of interest. # Invariant: For every entry, either it has no aliases, or it has no # From: lines, addresses, or names. # * .anon-similar outputs entries that are suspiciously similar; the user # might wish to coalesce their nicknames. This file is not read by the # program; it is only produced for human consumption. (I should add an # additional file that lists pairs which should be suppressed from # output in this file, so the user only has to make a decision once # about each pair rather than being repeatedly confronted by particular # similar but different addresses or names.) ########################################################################### ### To do ### # To do: # * Produce output in .anon-similar for similar-looking names (currently # done only for addresses); add a way to prune output from that file. # * Add support for .anon-extra file # * Be able to anonymize names even if broken across lines. # * If a name is "John Doe", then perhaps warn of occurrences of "John # [A-Z]\.? Doe", and vice versa, so it can be added to .anon-names. # * Add a lockfile (perhaps at a higher level, say in make-{day,month}.pl) # to prevent multiple simultaneous runs. Or write to a unique logfile, # and append that into the main logfile only at the end (or, better, both # at write_output_file and later at end). # Ideas for improving efficiency (not sure whether these will help): # * Don't bother checking fromline, address unless there is an "@" in the # line/paragraph. # * Use index instead of regular expressions. # * Replace by something of the same length (say, pad with # ASCII 1) for faster string manipulation; then a postpass can strip out # the filler characters. [This didn't seem to help, at least when the # postprocessing stage wrote the output to a temporary file.] # * Use something like Emacs's regexp-opt. (If it only cuts time by a third, # is that really worth it?) ########################################################################### ### Preliminaries ### use checkargs; use util_mde; use Carp; use strict; use Mail::Address; my $quiet = 0; if ((scalar(@ARGV) == 2) && ($ARGV[0] eq "--test")) { open(TESTFILE, "<$ARGV[1]") || die "Couldn't open test input file $ARGV[1]"; exit test(*TESTFILE); } if ((scalar(@ARGV) == 1) && ($ARGV[0] eq "--test")) { exit test(*DATA); } if ((scalar(@ARGV) == 1) && ($ARGV[0] eq "--quiet")) { $quiet = 1; } elsif (scalar(@ARGV) != 0) { die "Use $0 as a filter (optional arg --quiet), or call with one arg --test"; } my $scripts_dir = "/www/scripts"; my $logfile = "$scripts_dir/anonymize.log"; if (!open (LOGFILE, "> $logfile-$$")) { print STDERR "WARNING! couldn't create $logfile-$$: $!\n"; } { my $date = `date`; chomp($date); # the command line print LOGFILE $date . ": $0 @ARGV\n"; } ########################################################################### ### Variables ### my %nicknames = (); # nickname to [fromlines,addresses,names,alias] # list, where those are references to arrays. # Use for debugging, making unique nicknames. # Contents are in mixed case. my $nn_fromline_index = 0; my $nn_address_index = 1; my $nn_name_index = 2; my $nn_alias_index = 3; my $nn_max_index = 3; my @nn_reverse_index = ("fromline", "address", "name", "alias"); # Mixed-case versions, for human-friendly output. my %mc_fromline_to_nickname = (); my %mc_address_to_nickname = (); my %mc_name_to_nickname = (); my %mc_nickname_alias = (); # Lowercase versions of the above, for comparison. my %lc_fromline_to_nickname = (); my %lc_address_to_nickname = (); my %lc_name_to_nickname = (); %mc_fromline_to_nickname = input_array("$scripts_dir/.anon-fromline", %mc_fromline_to_nickname); %mc_address_to_nickname = input_array("$scripts_dir/.anon-address", %mc_address_to_nickname); %mc_name_to_nickname = input_array("$scripts_dir/.anon-name", %mc_name_to_nickname); %mc_nickname_alias = input_array("$scripts_dir/.anon-alias", %mc_nickname_alias); { my ($old, $new); while (($old, $new) = each %mc_nickname_alias) { # Avoid reusing the old nickname for a different individual. add_nickname_maybe($old); push @{$nicknames{$old}[$nn_alias_index]}, $new; %mc_fromline_to_nickname = hash_value_replace($old, $new, %mc_fromline_to_nickname); %mc_address_to_nickname = hash_value_replace($old, $new, %mc_address_to_nickname); %mc_name_to_nickname = hash_value_replace($old, $new, %mc_name_to_nickname); } } %lc_fromline_to_nickname = lowercase_nickname_hash($nn_fromline_index, %mc_fromline_to_nickname); %lc_address_to_nickname = lowercase_nickname_hash($nn_address_index, %mc_address_to_nickname); %lc_name_to_nickname = lowercase_nickname_hash($nn_name_index, %mc_name_to_nickname); sub lowercase_nickname_hash ( $$ ) { my ($index, %mc_key_to_nickname) = check_args_at_least(1, @_); my ($key, $nick, %lc_key_to_nickname); while (($key, $nick) = each %mc_key_to_nickname) { my $lc_key = lc($key); if (defined($lc_key_to_nickname{$lc_key}) && ($lc_key_to_nickname{$lc_key} ne $nick)) { my $whicharray = $nn_reverse_index[$index]; my $msg = "In $whicharray, nickname $nick for $key\n" . " " x length($whicharray) ." but $lc_key_to_nickname{$lc_key} for $lc_key"; print LOGFILE "$msg\n"; copy_logfile(); die $msg; } $lc_key_to_nickname{$lc_key} = $nick; add_nickname_maybe($nick); push @{$nicknames{$nick}[$index]}, $key; } return %lc_key_to_nickname; } sub add_nickname_maybe ( $ ) { my ($nick) = check_args(1, @_); if (!exists($nicknames{$nick})) { # why doesn't this work? (Maybe it should be [[], [], [], [], []].) # $nicknames{$nick} = ([], [], [], [], []); $nicknames{$nick} = (); for (my $i=0; $i<=$nn_max_index; $i++) { $nicknames{$nick}[$i] = []; } } } # I was getting complaints where this was used in regexps. Fix later. # (Was this due to the @? Should I concatenate rather than interpolating?) # my $email_re = '[-a-z0-9._]+@[-a-z0-9._]+'; ########################################################################### undef $/; my $text = ; # whole file now here $/ = "\n"; # so chomp() will work # Remove junk/uninteresting text { # $/ is a string, not a regexp! my $junk_sep = "\n===========================================================================\n"; $/ = $junk_sep; if (open(INPUT, "<$scripts_dir/.anon-junk-texts")) { my $junk; while (defined($junk = ) && $junk) { $junk =~ s/$junk_sep$//; # print STDERR "Junk:\n$junk\n"; my $junk_re = quotemeta($junk); $junk_re =~ s/$/ */mg; $text =~ s/\n$junk_re\n/\n/gi; $junk_re =~ s/^/(> *)?/mg; $text =~ s/\n$junk_re\n/\n/gi; } close(INPUT); } $/ = "\n"; # Remove trailing quotation of blank lines. $text =~ s/(^>[ \t]*\n)+([^>])/$2/gm; } # Remove leading "CF:" and "[C-L]" from subject lines # (Do I need both the upcase and downcase versions of "cf:"?) $text =~ s/^(Subject:\s+((Re|Fw):\s*)*)(cf:|CF:|\[C-L\])\s*/$1/gim; # Anonymize From: lines, find new names $text =~ s/^(From:\s+)(.*)$/from_line_replacement($2,$1)/gem; # This comes early so that we can begin editing the files or start another # anonymize.pl process. But it must follow calls to from_line_replacement. write_output_files(); copy_logfile(); if (!open (LOGFILE, "> $logfile-$$")) { print STDERR "WARNING! couldn't create $logfile-$$: $!\n"; } # Anonymize other uses of name and/or address { # Don't do "study $text;" because I'll have to restudy after each change. my ($full, $short); # Be sure to try longer before shorter (ie, fromline before address or name). while (($full, $short) = each %lc_fromline_to_nickname) { # print STDERR "Fromline: $short $full\n"; my $full_re = quotemeta($full); $text =~ s/\b\Q$full\E\b/$short/gi; } while (($full, $short) = each %lc_address_to_nickname) { # print STDERR "Address: $short $full\n"; $text =~ s/?/$short/gi; } while (($full, $short) = each %lc_name_to_nickname) { # Do nothing if just one word, to avoid excessive replacement # for "Barney" or "Michael". if (($full =~ / /) # Also forbid certain strings (should generalize this) && ($full ne "Cystic Fibrosis")) { # print STDERR "Name: $short $full\n"; $text =~ s/\b\Q$full\E\b/$short/gi; } } } print $text; ########################################################################### copy_logfile(); exit(); ########################################################################### # Append $logfile-$$ to $logfile # This closes the currently open logfile, so the next action should be to # reopen LOGFILE or to exit. sub copy_logfile () { open(MAIN_LOGFILE, ">> $logfile") or die "Couldn't append to $logfile: $!"; open(LOGFILE, "$logfile-$$") or die "Couldn't read $logfile-$$: $!"; my $line; while (defined($line = )) { print MAIN_LOGFILE $line; } close(LOGFILE); close(MAIN_LOGFILE); unlink "$logfile-$$"; } # Write output files sub write_output_files () { # These come first in case of trouble. output_array("$scripts_dir/.anon-fromline", %mc_fromline_to_nickname); output_array("$scripts_dir/.anon-address", %mc_address_to_nickname); output_array("$scripts_dir/.anon-name", %mc_name_to_nickname); # Output similar-looking addresses. { my $outfile = "$scripts_dir/.anon-similar"; if (!open(OUTPUT,">$outfile")) { my $msg = "Couldn't output to $outfile: $!"; print LOGFILE "$msg\n"; copy_logfile(); die $msg; } my @sorted_keys = sort keys %lc_address_to_nickname; for (my $i = 0; $i < $#sorted_keys; $i++) { my $addr1 = $sorted_keys[$i]; my $addr2 = $sorted_keys[$i+1]; my $nick1 = $lc_address_to_nickname{$addr1}; my $nick2 = $lc_address_to_nickname{$addr2}; if ($nick1 eq $nick2) { next; } my $user1 = $addr1; my $user2 = $addr2; $user1 =~ s/@.*$//o; $user2 =~ s/@.*$//o; if (lc($user1) eq lc($user2)) { print OUTPUT "\n$addr1 \t$nick1\n$addr2 \t$nick2\n"; } } close(OUTPUT); } # Output all nickname info { my $outfile = "$scripts_dir/.anon-all"; if (!open(OUTPUT,">$outfile")) { my $msg = "Couldn't output to $outfile: $!"; print LOGFILE "$msg\n"; copy_logfile(); die $msg; } for my $nickname (sort keys %nicknames) { print OUTPUT "$nickname\n"; for (my $i=0; $i<=$nn_max_index; $i++) { # I'm getting "Can't use an undefined value as an ARRAY reference" # here, but I don't know why and can't reproduce it consistently. if (!defined($nickname)) { carp "Undefined nickname"; print OUTPUT " <<>>\n"; next; } if (!defined($nicknames{$nickname})) { carp "Undefined nicknames{$nickname}"; print OUTPUT " <<>>\n"; next; } if (!defined($nicknames{$nickname}[$i])) { carp "Undefined nicknames{$nickname}[$i] where $i = $nn_reverse_index[$i]"; print OUTPUT " <<>>\n"; next; } ## This might be overkill; I'm not sure. # if (!defined(@{$nicknames{$nickname}[$i]})) # { carp "Undefined \@{nicknames{$nickname}[$i]} where $i = $nn_reverse_index[$i]"; # print OUTPUT " <<>>\n"; # next; } print OUTPUT " @{$nicknames{$nickname}[$i]}\n"; } } close(OUTPUT); } # Fix permissions my $files = join(' ', "$scripts_dir/.anon-fromline", "$scripts_dir/.anon-address", "$scripts_dir/.anon-name", "$scripts_dir/.anon-similar", "$scripts_dir/.anon-all"); system_or_die("chgrp cf-web $files"); system_or_die("chmod g+rw $files"); system_or_die("chmod o-rw $files"); # If .log file hasn't yet been written, this is pretty useless. if (!$quiet) { print STDERR "Wrote .anon-* files\n"; } } ########################################################################### sub from_line_replacement ( $$ ) { my ($mc_fromline, $header) = check_args(2, @_); # print STDERR "from_line_replacement: `$header'`$from'\n"; if (!(defined($mc_fromline) && defined($header))) { croak "Bad arguments: from_line_replacement($mc_fromline, $header)"; } # $from was embedded in ^(From:\s+)(.*)$; the result adds that back on if ($mc_fromline !~ /[@<>]/) { # No atsign or <...>: problably not an email From: line $mc_fromline =~ s/^.* on behalf of (.*)$/$1/i; return "$header$mc_fromline"; } my $lc_fromline = lc($mc_fromline); if (exists($lc_fromline_to_nickname{$lc_fromline})) { return "$header$lc_fromline_to_nickname{$lc_fromline} <$lc_fromline_to_nickname{$lc_fromline}>"; } my ($mc_name,$mc_address) = addr_name_and_address($mc_fromline); if (!defined($mc_name)) { my $msg = "Why isn't name defined? $mc_fromline"; print LOGFILE "$msg\n"; copy_logfile(); die $msg; } if (!defined($mc_address)) { my $msg = "Why isn't address defined? $mc_fromline"; print LOGFILE "$msg\n"; warn $msg; } my $mc_clean_name = cleanup_name($mc_name); if (($mc_name eq "") or ($mc_address eq "") or ($mc_fromline eq "")) { my $msg = "Something is empty!\n name=\"$mc_name\"\n address=\"$mc_address\"\n fromline=\"$mc_fromline\""; print LOGFILE "$msg\n"; warn $msg; return "$header$mc_fromline"; } ## Old implementation that didn't use Mail::Address # my $address; # my $name; # # Only bother with lines of the form "From: Foo " or "From: foo@bar" # # or "From: foo@bar (Baz)"; most others are inside message body. # if ($from =~ /^(.*)\s+<(.*)>$/) # { $name = $1; # $name =~ s/\s+$//o; # strip leading spaces # $address = $2; } # elsif ($from =~ /^([^@ ]+)@([^@ ])+$/) # { $name = $1; # $address = $from; } # elsif ($from =~ /^((?:[^@ ]+)@(?:[^@ ])+) \((.*)\)$/) # { $name = $2; # $address = $1; } # else # { print $line; # goto DONE_WITH_FROM; } my $lc_name = lc($mc_name); my $lc_clean_name = lc($mc_clean_name); my $lc_address = lc($mc_address); # print STDERR "definedness: $lc_name $lc_name_to_nickname{$lc_name}\n $lc_address $lc_address_to_nickname{$lc_address}\n" if (defined($lc_name_to_nickname{$lc_clean_name}) && defined($lc_address_to_nickname{$lc_address}) && ($lc_name_to_nickname{$lc_clean_name} eq $lc_address_to_nickname{$lc_address})) { # Same person, different From: line format my $nickname = $lc_name_to_nickname{$lc_clean_name}; if ((!defined $mc_name_to_nickname{$mc_name}) && ($mc_name =~ / /)) { $lc_name_to_nickname{$lc_name} = $nickname; # possibly already set $mc_name_to_nickname{$mc_name} = $nickname; push @{$nicknames{$nickname}[$nn_name_index]}, $mc_name; } $mc_fromline_to_nickname{$mc_fromline} = $nickname; $lc_fromline_to_nickname{$lc_fromline} = $nickname; push @{$nicknames{$nickname}[$nn_fromline_index]}, $mc_fromline; return "$header$nickname <$nickname>"; } my $nickname = make_nickname($mc_clean_name); if (!defined($nickname) || !$nickname) { my $msg = "No nickname for `$mc_clean_name' `$mc_address': $mc_fromline"; print LOGFILE "$msg\n"; copy_logfile(); die $msg; } ## Check whether this address already does (or should) have a nickname. if (defined($lc_address_to_nickname{$lc_address}) && ($lc_address_to_nickname{$lc_address} ne $nickname)) { my $reused_nickname = $lc_address_to_nickname{$lc_address}; my $base = nickname_base($nickname); my $reused_base = nickname_base($reused_nickname); if (($base ne $reused_base) # No warning if old "PamR", new "Pam" && ($reused_base !~ /^$base[A-Z]$/)) { my $msg = "reusing nickname $reused_nickname for $lc_address rather than new nickname $nickname based on $mc_clean_name\n"; if (!$quiet) { print STDERR $msg; } print LOGFILE $msg; print LOGFILE "To reverse this decision, add to .anon-alias:\n"; print LOGFILE "$reused_nickname\n$nickname\n\n"; # Avoid reusing the new nickname for a different individual. add_nickname_maybe($nickname); push @{$nicknames{$nickname}[$nn_alias_index]}, $reused_nickname; } $nickname = $reused_nickname; } elsif (defined($lc_name_to_nickname{$lc_clean_name}) && ($lc_name_to_nickname{$lc_clean_name} ne $nickname)) { my $reused_nickname = $lc_name_to_nickname{$lc_clean_name}; if (nickname_base($nickname) ne nickname_base($reused_nickname)) { my $msg = "case mismatch: reusing nickname $reused_nickname rather than new nickname $nickname based on $mc_clean_name\n"; if (!$quiet) { print STDERR $msg; } print LOGFILE $msg; print LOGFILE "To reverse this decision, add to .anon-alias:\n"; print LOGFILE "$reused_nickname\n$nickname\n\n"; # Avoid reusing the new nickname for a different individual. add_nickname_maybe($nickname); push @{$nicknames{$nickname}[$nn_alias_index]}, $reused_nickname; } $nickname = $reused_nickname; } ## Install the new nickname. # Also need to put it in the %nicknames hash here. add_nickname_maybe($nickname); $mc_fromline_to_nickname{$mc_fromline} = $nickname; $lc_fromline_to_nickname{$lc_fromline} = $nickname; push @{$nicknames{$nickname}[$nn_fromline_index]}, $mc_fromline; $mc_address_to_nickname{$mc_address} = $nickname; $lc_address_to_nickname{$lc_address} = $nickname; push @{$nicknames{$nickname}[$nn_address_index]}, $mc_address; if ($mc_name =~ / /) { $mc_name_to_nickname{$mc_name} = $nickname; $lc_name_to_nickname{$lc_name} = $nickname; push @{$nicknames{$nickname}[$nn_name_index]}, $mc_name; if ($mc_clean_name ne $mc_name) { $mc_name_to_nickname{$mc_clean_name} = $nickname; push @{$nicknames{$nickname}[$nn_name_index]}, $mc_clean_name; if ($lc_clean_name ne $lc_name) { $lc_name_to_nickname{$lc_clean_name} = $nickname; } } } return "$header$nickname <$nickname>"; } ########################################################################### ### Name manipulation ### # Given a name extracted from an email address, convert it to standard # firstname lastname format. # Some of this is lifted from name and _extract_name of Mail::Address # (which is too buggy and ideosyncratic to use directly). sub cleanup_name ( $ ) { my ($name) = check_args(1, @_); # Punctuation $name =~ s/^\"(.*)\"$/$1/; # remove surrounding quotes $name =~ s/^\"//; # Remove leading quote, just in case $name =~ s/\A\(|\)\Z//g; # remove leading "(" or trailing ")" # I'm not sure this is always right; some of these (eg `') might be nicknames. $name =~ s/\(.*\)//g; # remove embedded comments $name =~ s/\[.*\]//g; # remove embedded comments $name =~ s/\`.*\'//g; # remove embedded comments $name =~ s/[\[\(].*$//; # remove unterminated comments $name =~ s/^[^A-Za-z]+//; # remove all leading punctuation (too much?) $name =~ s/[^A-Za-z.]+$//; # remove all trailing punctuation (too much?) # Mail::Address->parse($textaddr) introduces spaces after initials in # unquoted names: returns "Foo A . Bar" given "Foo A. Bar ". $name =~ s/\b([A-Z]) \. /$1. /g; ## Remove trailing gubbish # $name =~ s/[0-9_]+$//; # $name =~ s/[^a-zA-Z].*$//; # this removes too much $name =~ s/[0-9_\/@%].*$//; # remove trailing digits from email address # Comments: "JAMES R. TWINE - THE NERD" $name =~ s/(\s+-|:)\s+the.*//i; # change Barr, G. M. => G. M. Barr # first name must be initials, comma is optional $name =~ s/^([a-z]+(-[a-z]+)*)(\s*,\s*|\s+)(([a-z](\s+|\s*\.\s*))+)([^\sa-z]|\Z)/$3 $1/io; # change Buschle, Mark => Mark Buschle $name =~ s/^([a-z]+(-[a-z]+)*)\s*,\s*([-a-z\s.]+)([^-\sa-z.]|\Z)/$3 $1/io; # # remove leading/trailing punctuation # s/\A[^a-z]*(([a-z \-\.]|[a-z]\-[a-z])+).*/$1/igo; $name =~ s/[\.\s]+/ /go; # change . => ' ' and condense spaces ## Remove leading gubbish $name =~ s/^=\?ISO-8859-1\?Q\?//o; $name =~ s/^A ?HREF=\"mailto://o; $name =~ s/^by way of //o; $name =~ s/^[^a-zA-Z]+//; # remove leading non-letters $name =~ s/^((Dr|Mr|Ms|Mrs|Miss|The)+\.?\s+)+//i; # remove leading title $name =~ s/\bnet://; # Fix capitalization if all caps or all lowercase if ((($name =~ /^[^A-Z]+$/) || ($name =~ /^[^a-z]+$/)) # except do nothing if no spaces or vowels: probably initials && ($name =~ /[aeiou ]/i)) { $name =~ s/\b([a-z]+)/\L\u$1/igo; # Upcase first letter of name $name =~ s/\bMc([a-z])/Mc\u$1/igo; # Scottish names such as 'McLeod' $name =~ s/\b(x*(ix)?v*(iv)?i*)\b/\U$1/igo; # Roman numerals # eg 'Level III Support' # Roman numerals upto M (I think ?) # m*([cxi]m)?d*([cxi]d)?c*([xi]c)?l*([xi]l)?x*(ix)?v*(iv)?i* } $name =~ s/(\A\s+|\s+\Z)//go; # remove leading and trailing space $name =~ s/\s\s+/ /; # compress multiple spaces return $name; } # Assumption: the argument name has already been cleaned up, via cleanup_name. sub make_nickname ( $ ) { my ($name) = check_args(1, @_); # Assumption: the argument has already been cleaned up. # $name = cleanup_name($name); # # Strip leading last name, if first name follows after comma # $name =~ s/^[^,]*,\s+([A-Z][a-z][^,])$/$1/; # # Remove all but first name # # $name =~ s/\s.*$//; # $name =~ s/[^A-Za-z].*$//; # $name =~ s/^([A-Z])[A-Z][a-z]+$/$1/; # strip last name # $name =~ s/([a-z][a-z])[A-Z]$/$1/; # strip last initial # Remove trailing "Jr", etc. $name =~ s/(,\s*|,?\s+)(Jr|Sr|I|II|III|IV|Cf Rx)\.?$//i; # Remove anything after a comma which follows at least two words. $name =~ s/([A-Z][a-z]+\s[^,]*\b[A-Z][a-z]+),.*$/$1/i; # Remove leading initial, if multiple last names # (neither last name nor another initial follows leading initial) $name =~ s/^[A-Z]\.?\s+([A-Z][a-z].* )/$1/; # I should abstract out the last name and first name parts, as they # are really independent. # Determine last name, and strip it off. my ($last, $first, $nickname); if ($name !~ /\s/) # Single name: "Cher", "Prince", "Madonna" { $nickname = $name; } else { if (($name !~ /\band\b/) # Leading \b means that we must follow some other name. # Do replacement in case only initial precedes this: don't want to # reuse a "van" part as a name after a leading initial. && ($name =~ s/\b\s+(([a-z][a-z]+\s+)+)((\s*[A-Z][a-z]+)+)$//)) # Multiple last names, some lowercase: # "Stephen R. van den Berg", "Stephen R. van den Berg Hoffman" { $last = concat_initials($1) . concat_initials($3); } elsif ($name =~ s/(([A-Z][a-z]*)\s+(([a-z][. ]+)+))((\s*[A-Z][a-z]+)+)$/$1/) # multiple last names following a full name and an initial { $last = concat_initials($5); $name =~ s/\s+$//; } elsif ($name =~ s/\b.?\s+\b([A-Z][a-z]+(-[A-Z][a-z]+)+)$//) # Hyphenated last names { $last = $1; $last =~ s/-/ /; $last = concat_initials($last); $name =~ s/\s+$//; } elsif ($name =~ s/\b\.?\s+([a-z])(|\.|[a-z\']*)$//i) { $last = "\U$1"; } elsif ($name =~ s/\s+\b([a-z])[-a-z]*$//i) { $last = "\U$1"; } # # Handle "C P,Yates" # elsif ($name =~ s/([^\s])\s*\b([a-z][-a-z]+)$/$1/i) # { $last = "\U$1"; } elsif ($name =~ s/([a-z])[^a-z]+([a-z])[-a-z]*[^a-z]*$/$1/i) # This is a last resort; perhaps complain as well at this point { $last = "\U$2"; } else { my $msg = "Problem finding last name: $name"; print LOGFILE "$msg\n"; copy_logfile(); die $msg; } if ($name =~ s/\s+$//) { my $msg = "Trailing whitespace in name: `$name'"; print LOGFILE "$msg\n"; copy_logfile(); die $msg; } # Determine first name if ($name =~ /^(([a-z][. ]+)+).*\s*\b([a-z][a-z]+)/i) # Initials followed (eventually) by a name: "R. L. Michael", "R L Michael". { $first = concat_initials($3); } elsif ($name =~ /^((\b[a-z]\b[. ]*)+)$/i) # Only initials. { $first = concat_initials($name); } elsif ($name =~ /([^\s]*)\s/) # Ordinary name, possibly followed by middle name or initials. { $first = $1; } else { $first = $name; } $nickname = $first . $last; } # Add trailing number to differentiate { my $num = 1; while (exists($nicknames{$nickname . $num})) { $num++; } $nickname = $nickname . $num; } # print STDERR "make_nickname(@_) => $nickname\n"; return $nickname; } # Return a string consisting of the first letters of the words. # "X. Y. Z." => "XYZ" # "X Y Z" => "XYZ" # "abc def ghi" => "adg" sub concat_initials ( $ ) { my ($names) = check_args(1, @_); $names =~ s/^\s+//; $names =~ s/\s+$//; return join('', map { substr($_, 0, 1) } split(/[.\s]+/, $names)); } # Given a nickname, return it sans the trailing digits sub nickname_base ( $ ) { my ($nick) = check_args(1, @_); $nick =~ s/[0-9]+$//; return $nick; } # Like (Mail::Address->parse($textaddr))[0]->name, but doesn't massage the name. sub addr_name_and_address ( $ ) { my ($textaddr) = check_args(1, @_); # remove "by way of" comment $textaddr =~ s/\s+\(by way of[^\)]*([\)]|\Z)//; # remove extraneous comment between name and address $textaddr =~ s/(\"[^\"]*\"), INTERNET:([^\s]*)/$1 <$2>/; if ($textaddr =~ /^.* on behalf of (.*)$/) # Perhaps I can do better than this; I'm not sure. { return ($1,undef); } if ($textaddr =~ /^\"([^@\"]*)$/) { return ($1,undef); } if (($textaddr =~ /\s/) && ($textaddr !~ /[@<]/)) { return ($textaddr,undef); } if ($textaddr =~ /^(.*)<\/A>/oi) { return ($2,$1); } if ($textaddr =~ s/=\?iso-8859-1\?Q\?(.*)=?/$1/i) { # Lots more could be added here, no doubt. $textaddr =~ s/=E9/é/g; $textaddr =~ s/=F4/o/g; $textaddr =~ s/=F6/ö/g; } $textaddr =~ s/\[(SMTP|mailto):([-a-z0-9._]+@[-a-z0-9._]+)\]/<$2>/i; $textaddr =~ s/\[([-a-z0-9._]+@[-a-z0-9._]+)\]/<$1>/i; $textaddr =~ s/,(\s+and)/$1/; $textaddr =~ s/\s+[0-3]?[0-9][- ](jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[- ](199|200)[0-9]( [0-9][0-9]:[0-9][0-9]:[0-9][0-9]\.[0-9][0-9])?$//i; # print STDERR "textaddr: $textaddr\n"; $textaddr =~ s/^([^@]+)(, |\s*>\s*)(INTERNET:)?([-a-z0-9._]+@[-a-z0-9._]+)$/$1 <$4>/oi; # print STDERR "textaddr: $textaddr\n"; $textaddr =~ s/^[A-Z]+%\"([-a-z0-9._]+@[-a-z0-9._]+)\"$/$1/i; $textaddr =~ s/^[A-Z]+%\"([-a-z0-9._]+@[-a-z0-9._]+)\"\s+(\".*\")$/$2 <$1>/i; $textaddr =~ s/<([-a-z0-9._]+@[-a-z0-9._]+)>.*Knysna.*$/<$1>/i; # total hack, yuck # @maddrs is a list of extracted Mail::Address objects. my @maddrs = Mail::Address->parse($textaddr); if (scalar(@maddrs) != 1) { my $errmsg = "Wrong return length " . scalar(@maddrs) . " from Mail::Address->parse($textaddr)"; for my $maddr (@maddrs) { $errmsg .= "\n " . $maddr->format; } print LOGFILE "$errmsg\n"; # croak $errmsg; carp $errmsg; } my $maddr = $maddrs[0]; my $address = $maddr->address; if (!defined($address)) { my $msg = "undefined email address in parsed address $textaddr"; print LOGFILE "$msg\n"; carp $msg; } my $name = $maddr->phrase; if (defined($name) && length($name) == 0) { undef $name; } if (!defined($name)) { $name = $maddr->comment; if (defined($name) && length($name) == 0) { undef $name; } } my $addr = $maddr->address; if ((!defined($name) || ($name =~ /^[^. ]*@/)) && ($addr =~ /([^\%\.\@\_]+([\.\_][^\%\.\@\_]+)+)[\@\%]/o)) { # first.last@domain address ($name = $1) =~ s/[\.\_]+/ /go; } if (!defined($name)) { if ($addr =~ m/\/g=/oi) { # X400 style address my ($f) = $addr =~ m#g=([^/]*)#oi; my ($l) = $addr =~ m#s=([^/]*)#io; $name = "$f $l"; } else { $name = $addr; $name =~ s/@.*$//; } } # return length($name) ? $name : undef; return ($name,$address); } ########################################################################### ### Array I/O ### # The lines of the file representation are alternating keys and values. # Second argument is an array to modify. sub input_array ($%) { my ($file, %array) = check_args_at_least(1, @_); if (open(INPUT,$file)) { my $key; while (defined($key = ) && $key) { my $value = ; if (!defined($value)) { my $msg = "bad input file has odd length: no value for $key"; print LOGFILE "$msg\n"; copy_logfile(); die $msg; } chomp($key); chomp($value); if (($key eq "") || ($value eq "")) { my $msg = "input_array: empty key or value: <<$key>> <<$value>>"; print LOGFILE "$msg\n"; carp $msg; } else { $array{$key} = $value; } } close(INPUT); } return %array; } sub output_array ($%) { my ($outfile, %array) = check_args_at_least(1, @_); if (-f $outfile) { # This first rename is for debugging, or if the program is run twice # (automatically), ie, once after an error of some sort. Don't bother # checking the result of the first call. if (-f "$outfile.bak") { rename("$outfile.bak", "$outfile.bak.bak"); } if (!rename($outfile, "$outfile.bak")) { my $msg = "can't rename $outfile to $outfile.bak: $!"; print LOGFILE "$msg\n"; copy_logfile(); die $msg; } } if (!open(OUTPUT,">$outfile")) { my $msg = "Couldn't output to $outfile: $!"; print LOGFILE "$msg\n"; copy_logfile(); die $msg; } for my $key (sort keys %array) { if (($key =~ /\n/) || ($array{$key} =~ /\n/)) { croak "Newline in key or value: <<$key>> <<$array{$key}>>"; $key =~ s/\n/ /; $array{$key} =~ s/\n/ /; } if (($key eq "") || ($array{$key} eq "")) { my $msg = "output_array: empty key or value: <<$key>> <<$array{$key}>>"; print LOGFILE "$msg\n"; carp $msg; } else { print OUTPUT "$key\n$array{$key}\n"; } } close(OUTPUT); } ########################################################################### ### Testing ### sub test ( $ ) { my ($fh) = check_args(1, @_); my $errors = 0; my @lines; chomp(@lines = <$fh>); # doesn't work print "1..",scalar(@lines),"\n"; my $i = 1; foreach my $ln (@lines) { next unless($ln =~ /\S/); my ($test,$nick) = (split(/\t/,$ln),""); my ($name,$address) = addr_name_and_address($test); my $cleaned = cleanup_name($name); my $enick = make_nickname($cleaned); print "$test\n $name\n $cleaned\n $enick\n"; # print " name: $name\n clean: $cleaned\n nick: $enick\n"; if($enick eq $nick) { # print "ok ",$i,"\n"; } else { # print "not ok ",$i,"\n"; # warn "\n" . $test . "\n" . $eformat . "\n" . $ename. "\n" . $format . "\n" . $name . "\n\n"; } $i++; } return $errors; } __DATA__ "Joe & J. Harvey" , JJV @ BBN "Joe & J. Harvey" JJV @ BBN "spickett@tiac.net" rls@intgp8.ih.att.com (-Schieve,R.L.) bodg fred@tiuk.ti.com m-sterni@mars.dsv.su.se jrh%cup.portal.com@portal.unix.portal.com astrachan@austlcm.sps.mot.com ('paul astrachan/xvt3') TWINE57%SDELVB.decnet@SNYBUFVA.CS.SNYBUF.EDU (JAMES R. TWINE - THE NERD) David Apfelbaum "JAMES R. TWINE - THE NERD" bilsby@signal.dra (Fred C. M. Bilsby) /G=Owen/S=Smith/O=SJ-Research/ADMD=INTERSPAN/C=GB/@mhs-relay.ac.uk apardon@rc1.vub.ac.be (Antoon Pardon) "Stephen Burke, Liverpool" Andy Duplain Gunnar Zoetl The Newcastle Info-Server wsinda@nl.tue.win.info (Dick Alstein) mserv@rusmv1.rus.uni-stuttgart.de (RUS Mail Server) Suba.Peddada@eng.sun.com (Suba Peddada [CONTRACTOR]) ftpmail-adm@info2.rus.uni-stuttgart.de Paul Manser (0032 memo) "gregg (g.) woodcock" Clive Bittlestone Graham.Barr@tiuk.ti.com "Graham Bisset, UK Net Support, +44 224 728109" a909937 (Graham Barr (0004 bodg)) a909062@node_cb83.node_cb83 (Colin x Maytum (0013 bro5)) a909062@node_cb83.node_cb83 (Colin Maytum (0013 bro5)) fred@john (Level iii support) Derek.Roskell%dero@msg.ti.com ":sysmail"@ Some-Group. Some-Org, Muhammed.(I am the greatest) Ali @(the)Vegas.WBA david d `zoo' zuhn "Christopher S. Arthur" Jeffrey A Law lidl@uunet.uu.net (Kurt J. Lidl) Kresten_Thorup@NeXT.COM (Kresten Krab Thorup) hjl@nynexst.com (H.J. Lu) berg@POOL.Informatik.RWTH-Aachen.DE (Stephen R. van den Berg) @oleane.net:hugues@afp.com a!b@c.d foo!bar!foobar!root THX3andMe "Alan Edmonds (by way of Ron Trueworthy )" Alan W. Sponseller Dave & Karen