# Copyright 2009 Alexandra Klepatsch # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # facileOCR version: 0.5 package facileOCR; use strict; use Mail::SpamAssassin; use Mail::SpamAssassin::Util; use Mail::SpamAssassin::Plugin; use File::Temp qw/ tempfile tempdir /; our @ISA = qw (Mail::SpamAssassin::Plugin); my $maxsize = 50; # constructor: register the eval rule sub new { my ( $class, $mailsa ) = @_; $class = ref($class) || $class; my $self = $class->SUPER::new($mailsa); $self->get_config($mailsa->{conf}); bless( $self, $class ); $self->register_eval_rule("check_ocr"); return $self; } sub get_config { # whatever was set in config, default values otherwise my ($self, $conf, $pms ) = @_; my @opts = (); push(@opts, { setting => 'spamwords', is_admin => 1, default => "viagra,cialis", type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING }); push(@opts, { setting => 'debuglog', is_admin => 1, default => "off", type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING }); push(@opts, { setting => 'ocr_profile', is_admin => 1, default => 1, type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC }); push(@opts, { setting => 'max_img_size', is_admin => 1, code => sub { my ($self, $key, $value, $line) = @_; # avoid unrealistic or wrong values if ($value ne "") { $value =~ s/.*[^\d].*/-1/g; if ($value > 100 or $value < 10) { if ($value > 100) {$value = 100;} elsif ($value < 10) {$value = 10;} else {$value = 50;} } else {$maxsize = $value;} } } }); $conf->{parser}->register_commands(\@opts); } sub check_ocr # call the other subs, give the score { my ( $self, $pms ) = @_; my $dbg = $pms->{main}->{conf}->{debuglog}; debuglog($dbg, "start with tests"); my ($imgTextOcr, $numImages) = imageExtractionFromMSG($dbg, $pms->{msg}, $pms->{main}->{conf}); my $cnt = 0; my ($configwords, @words); if ($numImages > 0 && $imgTextOcr ne "") { debuglog($dbg, "$numImages img found, imgTextOcr contains:\n $imgTextOcr\n"); ($configwords = $pms->{main}->{conf}->{spamwords}) =~ s/[&*@\\\/`!{}\[\]:\^\$"='<>()%;~|.+?]//gi; @words = split(/,/, $configwords); my $w; foreach $w (@words) { while ($imgTextOcr =~ m/$w/gi) { $cnt++; } } debuglog($dbg, "number of spamwords found: $cnt"); } else { debuglog($dbg, "no result"); } if($cnt) { # finally give the score my $score = sprintf("%0.2f", $cnt * 0.8); $pms->got_hit("FOCR", "facileOCR: $cnt ", score => $score ); for my $set (0..3) { $pms->{conf}->{scoreset}->[$set]->{"FOCR"} = $score; } } return 0; } sub imageExtractionFromMSG # ($imgTextOcr, $numImages) = imageExtractionFromMSG($msg) # Return all text and the number of attached images # called by check_ocr { my $dbg = $_[0]; my $msg = $_[1]; my $conf = $_[2]; my @mimeStr = ("image/*", "img/*"); my $num=0; my $imgTextOcr = ""; my $maxbytes = ($maxsize * 1024) + 10; foreach (@mimeStr) { # Search all attachments with current MIME my @img_parts = $msg->find_parts($_); for (my $i=0; $i <= $#img_parts; $i++) { my $imagestream = $img_parts[$i]->decode($maxbytes); $imgTextOcr = join $imgTextOcr, textExtractionByOCR($dbg, $imagestream, $conf); $num++; } } return ($imgTextOcr, $num); } sub textExtractionByOCR # $textOut = textExtractionByOCR( $imagestream ) # tests, convert, text extraction # called by imageExtractionFromMSG { my $dbg = $_[0]; my $imagestream = $_[1]; my $conf = $_[2]; my $imagelen = sprintf("%0.2f", length($imagestream) / 1024); my ($fh_a, $tmpImg) = tempfile('/tmp/sa_tmpImg.XXXXXXXX'); # never bigger than config max size or smaller than 6kB if ($imagelen < 6) { debuglog($dbg, "skip, image size = $imagelen kB"); unlink($fh_a, $tmpImg); return ""; } if ($imagelen > $maxsize) { debuglog($dbg, "skip, image size > $maxsize kB"); unlink($fh_a, $tmpImg); return ""; } open (FILE, ">$tmpImg") or return ""; print FILE "$imagestream\n"; close FILE; my $imageIdentifyTxt = ""; # check WxH, type, layers open EXEFH, "identify -quiet -format '%wx%h %m %n' $tmpImg 2>/dev/null |"; $imageIdentifyTxt = join "", ; close EXEFH; if( $imageIdentifyTxt =~ /^(\d+)x(\d+) (GIF|PNG|JPEG).* ([1-9])$/ ) { my $width = $1; my $height = $2; my $magic = $3; my $layercnt = $4; debuglog($dbg, "img info: $width x $height $magic, $layercnt layer(s), $imagelen kB"); if($width * $height > 1024*1024 && ($width > 1024 or $height > 1024) ) { debuglog($dbg, "skip, image dimension = $width x $height"); unlink($fh_a, $tmpImg); return ""; } } else { debuglog($dbg, "identify: skip, wrong type, image broken or too many layers"); unlink($fh_a, $tmpImg); return ""; } # -append :: concatenate image # -flatten :: fuse layers # -depth :: 8-bit img # -background white :: convert tranparency my $convertOpts = "-append -flatten -depth 8 -background white"; my $textOut = ""; my $ocrProfile = ""; my $ocradOpts = ""; my ($fh_b, $tmpTxt) = tempfile('/tmp/sa_tmpTxt.XXXXXXXX'); if ($conf->{ocr_profile} =~ /^[1-3]$/) { $ocrProfile = $conf->{ocr_profile}; } else { $ocrProfile = 1; } SWITCH: { if ($ocrProfile == 1) {$ocradOpts = "-s 2 -F utf8 -e letters_only";} if ($ocrProfile == 2) {$ocradOpts = "-s 3 -i -F utf8 -e letters_only";} if ($ocrProfile == 3) {$ocradOpts = "-s 3 -T 38% -F utf8 -e letters_only";} } # OCR call with timeout my $child; my $error = undef; my ($fh_c, $sh_exit) = tempfile('/tmp/sa_shExit.XXXXXXXX'); eval { local $SIG{ALRM} = sub { die 'alarm' }; $child = fork; die 'fork' unless(defined $child); if($child){ alarm 10; wait; alarm 0; } else { exec("convert $convertOpts $tmpImg pnm:- 2>> $sh_exit|ocrad $ocradOpts >$tmpTxt 2>> $sh_exit"); exit; } }; # get shell errors (exec), if any if ( -s $sh_exit ) { open SHXT, "< $sh_exit"; my $shexit = join "",; close SHXT; debuglog($dbg, "shell errors: \n$shexit"); } # get timeout or fork errors, kill if needed if($error = $@) { if( $error =~ /alarm/) { alarm(0); kill 9, $child; debuglog($dbg, "convert/ocrad timeout: kill pid $child"); } elsif($error =~ /fork/) { debuglog($dbg, "Unable to fork: $error"); } else { die "Error: $error"; } } else { open OCRTXT, "< $tmpTxt"; $textOut = join "", ; $textOut =~ s/\s+$//mg; close OCRTXT; } unlink($fh_a, $tmpImg); unlink($fh_b, $tmpTxt); unlink($fh_c, $sh_exit); return $textOut; } sub debuglog() # write to a logfile { my ($dbg, $msg) = @_; if ($dbg eq "on") { my $timenow = localtime time; open (FILE, ">>/tmp/sa_OCR.log"); print FILE "$timenow -- $msg\n"; close FILE; } }