Hi,
cPanel has developed a native Perl Pyzor implementation for SpamAssassin
and a diff against SpamAssassin 4.0 follows.
At the moment I am using it in production on a small server; more testing and
opinions are welcome.
Original cPanel code is at https://metacpan.org/pod/Mail::Pyzor.
Cheers
Giovanni
diff --git a/MANIFEST b/MANIFEST
index 25d0192..2d9588c 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -126,6 +126,11 @@ lib/Mail/SpamAssassin/Plugin/WLBLEval.pm
lib/Mail/SpamAssassin/Plugin/WhiteListSubject.pm
lib/Mail/SpamAssassin/PluginHandler.pm
lib/Mail/SpamAssassin/Plugin/URILocalBL.pm
+lib/Mail/SpamAssassin/Pyzor/Client.pm
+lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
+lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
+lib/Mail/SpamAssassin/Pyzor/Digest.pm
+lib/Mail/SpamAssassin/Pyzor.pm
lib/Mail/SpamAssassin/RegistryBoundaries.pm
lib/Mail/SpamAssassin/Reporter.pm
lib/Mail/SpamAssassin/SQLBasedAddrList.pm
diff --git a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
index 3efd4b4..e4c9c05 100644
--- a/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
+++ b/lib/Mail/SpamAssassin/Plugin/Pyzor.pm
@@ -36,17 +36,13 @@ package Mail::SpamAssassin::Plugin::Pyzor;
use Mail::SpamAssassin::Plugin;
use Mail::SpamAssassin::Logger;
-use Mail::SpamAssassin::Timeout;
-use Mail::SpamAssassin::Util qw(untaint_var untaint_file_path
- proc_status_ok exit_status_str);
+use Mail::SpamAssassin::Util qw(untaint_var);
+
use strict;
use warnings;
# use bytes;
use re 'taint';
-use Storable;
-use POSIX qw(PIPE_BUF WNOHANG _exit);
-
our @ISA = qw(Mail::SpamAssassin::Plugin);
sub new {
@@ -78,7 +74,7 @@ sub set_config {
my ($self, $conf) = @_;
my @cmds;
-=head1 USER OPTIONS
+=head1 ADMINISTRATOR OPTIONS
=over 4
@@ -95,22 +91,7 @@ Whether to use Pyzor, if it is available.
type => $Mail::SpamAssassin::Conf::CONF_TYPE_BOOL
});
-=item pyzor_fork (0|1) (default: 0)
-
-Instead of running Pyzor synchronously, fork separate process for it and
-read the results in later (similar to async DNS lookups). Increases
-throughput. Experimental.
-
-=cut
-
- push(@cmds, {
- setting => 'pyzor_fork',
- is_admin => 1,
- default => 0,
- type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
- });
-
-=item pyzor_count_min NUMBER (default: 5)
+=item pyzor_count_min NUMBER (default: 5)
This option sets how often a message's body checksum must have been
reported to the Pyzor server before SpamAssassin will consider the Pyzor
@@ -128,54 +109,8 @@ set this to a relatively low value, e.g. C<5>.
type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
});
- # Deprecated setting, the name makes no sense!
- push (@cmds, {
- setting => 'pyzor_max',
- is_admin => 1,
- type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC,
- code => sub {
- my ($self, $key, $value, $line) = @_;
- warn("deprecated setting used, change pyzor_max to pyzor_count_min\n");
- if ($value !~ /^\d+$/) {
- return $Mail::SpamAssassin::Conf::INVALID_VALUE;
- }
- $self->{pyzor_count_min} = $value;
- }
- });
-
-=item pyzor_whitelist_min NUMBER (default: 10)
-
-This option sets how often a message's body checksum must have been
-whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
-result. Final decision is made by pyzor_whitelist_factor.
-
-=cut
-
- push (@cmds, {
- setting => 'pyzor_whitelist_min',
- is_admin => 1,
- default => 10,
- type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
- });
-
-=item pyzor_whitelist_factor NUMBER (default: 0.2)
-
-Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
-For default setting this means: 50 reports requires 10 whitelistings.
-
-=cut
-
- push (@cmds, {
- setting => 'pyzor_whitelist_factor',
- is_admin => 1,
- default => 0.2,
- type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
- });
-
=back
-=head1 ADMINISTRATOR OPTIONS
-
=over 4
=item pyzor_timeout n (default: 5)
@@ -210,478 +145,182 @@ removing one of them.
type => $Mail::SpamAssassin::Conf::CONF_TYPE_DURATION
});
-=item pyzor_options options
+=item pyzor_whitelist_min NUMBER (default: 10)
-Specify additional options to the pyzor(1) command. Please note that only
-characters in the range [0-9A-Za-z =,._/-] are allowed for security reasons.
+This option sets how often a message's body checksum must have been
+whitelisted to the Pyzor server for SpamAssassin to consider ignoring the
+result. Final decision is made by pyzor_whitelist_factor.
=cut
push (@cmds, {
- setting => 'pyzor_options',
+ setting => 'pyzor_whitelist_min',
is_admin => 1,
- default => '',
- type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
- code => sub {
- my ($self, $key, $value, $line) = @_;
- if ($value !~ m{^([0-9A-Za-z =,._/-]+)$}) {
- return $Mail::SpamAssassin::Conf::INVALID_VALUE;
- }
- $self->{pyzor_options} = $1;
- }
+ default => 10,
+ type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
});
-=item pyzor_path STRING
+=item pyzor_whitelist_factor NUMBER (default: 0.2)
-This option tells SpamAssassin specifically where to find the C<pyzor>
-client instead of relying on SpamAssassin to find it in the current
-PATH. Note that if I<taint mode> is enabled in the Perl interpreter,
-you should use this, as the current PATH will have been cleared.
+Ignore Pyzor result if REPORTCOUNT x NUMBER >= pyzor_whitelist_min.
+For default setting this means: 50 reports requires 10 whitelistings.
=cut
push (@cmds, {
- setting => 'pyzor_path',
+ setting => 'pyzor_whitelist_factor',
is_admin => 1,
- default => undef,
- type => $Mail::SpamAssassin::Conf::CONF_TYPE_STRING,
- code => sub {
- my ($self, $key, $value, $line) = @_;
- if (!defined $value || !length $value) {
- return $Mail::SpamAssassin::Conf::MISSING_REQUIRED_VALUE;
- }
- $value = untaint_file_path($value);
- if (!-x $value) {
- info("config: pyzor_path \"$value\" isn't an executable");
- return $Mail::SpamAssassin::Conf::INVALID_VALUE;
- }
-
- $self->{pyzor_path} = $value;
- }
+ default => 0.2,
+ type => $Mail::SpamAssassin::Conf::CONF_TYPE_NUMERIC
});
$conf->{parser}->register_commands(\@cmds);
}
sub is_pyzor_available {
- my ($self) = @_;
+ my ($self) = @_;
- my $pyzor = $self->{main}->{conf}->{pyzor_path} ||
- Mail::SpamAssassin::Util::find_executable_in_env_path('pyzor');
-
- unless ($pyzor && -x $pyzor) {
- dbg("pyzor: no pyzor executable found");
- $self->{pyzor_available} = 0;
- return 0;
- }
-
- # remember any found pyzor
- $self->{main}->{conf}->{pyzor_path} = $pyzor;
-
- dbg("pyzor: pyzor is available: $pyzor");
- return 1;
+ local $@;
+ eval {
+ require Mail::SpamAssassin::Pyzor::Digest;
+ require Mail::SpamAssassin::Pyzor::Client;
+ };
+ return $@ ? 0 : 1;
}
-sub finish_parsing_start {
- my ($self, $opts) = @_;
+sub get_pyzor_interface {
+ my ($self) = @_;
- # If forking, hard adjust priority -100 to launch early
- # Find rulenames from eval_to_rule mappings
- if ($opts->{conf}->{pyzor_fork}) {
- foreach (@{$opts->{conf}->{eval_to_rule}->{check_pyzor}}) {
- dbg("pyzor: adjusting rule $_ priority to -100");
- $opts->{conf}->{priority}->{$_} = -100;
- }
+ if (!$self->{main}->{conf}->{use_pyzor}) {
+ dbg("pyzor: use_pyzor option not enabled, disabling Pyzor");
+ $self->{pyzor_interface} = "disabled";
+ $self->{pyzor_available} = 0;
+ }
+ elsif ($self->is_pyzor_available()) {
+ $self->{pyzor_interface} = "pyzor";
+ $self->{pyzor_available} = 1;
+ }
+ else {
+ dbg("pyzor: no pyzor found, disabling Pyzor");
+ $self->{pyzor_available} = 0;
}
}
sub check_pyzor {
- my ($self, $pms, $full) = @_;
-
- return 0 if !$self->{pyzor_available};
- return 0 if !$self->{main}->{conf}->{use_pyzor};
-
- return 0 if $pms->{pyzor_running};
- $pms->{pyzor_running} = 1;
-
- return 0 if !$self->is_pyzor_available();
-
- my $timer = $self->{main}->time_method("check_pyzor");
+ my ($self, $permsgstatus, $full) = @_;
# initialize valid tags
- $pms->{tag_data}->{PYZOR} = '';
-
- # create fulltext tmpfile now (before possible forking)
- $pms->{pyzor_tmpfile} = $pms->create_fulltext_tmpfile();
-
- ## non-forking method
-
- if (!$self->{main}->{conf}->{pyzor_fork}) {
- my @results = $self->pyzor_lookup($pms);
- return $self->_check_result($pms, \@results);
- }
-
- ## forking method
-
- $pms->{pyzor_rulename} = $pms->get_current_eval_rule_name();
- $pms->rule_pending($pms->{pyzor_rulename}); # mark async
-
- # create socketpair for communication
- $pms->{pyzor_backchannel} = Mail::SpamAssassin::SubProcBackChannel->new();
- my $back_selector = '';
- $pms->{pyzor_backchannel}->set_selector(\$back_selector);
- eval {
- $pms->{pyzor_backchannel}->setup_backchannel_parent_pre_fork();
- } or do {
- dbg("pyzor: backchannel pre-setup failed: $@");
- delete $pms->{pyzor_backchannel};
- return 0;
- };
+ $permsgstatus->{tag_data}->{PYZOR} = "";
- my $pid = fork();
- if (!defined $pid) {
- info("pyzor: child fork failed: $!");
- delete $pms->{pyzor_backchannel};
- return 0;
- }
- if (!$pid) {
- $0 = "$0 (pyzor)";
- $SIG{CHLD} = 'DEFAULT';
- $SIG{PIPE} = 'IGNORE';
- $SIG{$_} = sub {
- eval { dbg("pyzor: child process $$ caught signal $_[0]"); };
- _exit(6); # avoid END and destructor processing
- kill('KILL',$$); # still kicking? die!
- } foreach qw(INT HUP TERM TSTP QUIT USR1 USR2);
- dbg("pyzor: child process $$ forked");
- $pms->{pyzor_backchannel}->setup_backchannel_child_post_fork();
- my @results = $self->pyzor_lookup($pms);
- my $backmsg;
- eval {
- $backmsg = Storable::freeze(\@results);
- };
- if ($@) {
- dbg("pyzor: child return value freeze failed: $@");
- _exit(0); # avoid END and destructor processing
- }
- if (!syswrite($pms->{pyzor_backchannel}->{parent}, $backmsg)) {
- dbg("pyzor: child backchannel write failed: $!");
- }
- _exit(0); # avoid END and destructor processing
- }
-
- $pms->{pyzor_pid} = $pid;
+ my $timer = $self->{main}->time_method("check_pyzor");
- eval {
- $pms->{pyzor_backchannel}->setup_backchannel_parent_post_fork($pid);
- } or do {
- dbg("pyzor: backchannel post-setup failed: $@");
- delete $pms->{pyzor_backchannel};
- return 0;
- };
+ $self->get_pyzor_interface();
+ return 0 unless $self->{pyzor_available};
- return 0;
+ return $self->pyzor_lookup($permsgstatus, $full);
}
sub pyzor_lookup {
- my ($self, $pms) = @_;
-
- my $conf = $self->{main}->{conf};
- my $timeout = $conf->{pyzor_timeout};
-
- # note: not really tainted, this came from system configuration file
- my $path = untaint_file_path($conf->{pyzor_path});
- my $opts = untaint_var($conf->{pyzor_options}) || '';
-
- $pms->enter_helper_run_mode();
-
- my $pid;
- my @resp;
- my $timer = Mail::SpamAssassin::Timeout->new(
- { secs => $timeout, deadline => $pms->{master_deadline} });
- my $err = $timer->run_and_catch(sub {
- local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
-
- dbg("pyzor: opening pipe: ".
- join(' ', $path, $opts, "check", "<".$pms->{pyzor_tmpfile}));
-
- $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
- $pms->{pyzor_tmpfile}, 1, $path, split(' ', $opts), "check");
- $pid or die "$!\n";
-
- # read+split avoids a Perl I/O bug (Bug 5985)
- my($inbuf, $nread);
- my $resp = '';
- while ($nread = read(PYZOR, $inbuf, 8192)) { $resp .= $inbuf }
- defined $nread or die "error reading from pipe: $!";
- @resp = split(/^/m, $resp, -1);
-
- my $errno = 0;
- close PYZOR or $errno = $!;
- if (proc_status_ok($?, $errno)) {
- dbg("pyzor: [%s] finished successfully", $pid);
- } elsif (proc_status_ok($?, $errno, 0, 1)) { # sometimes it exits with 1
- dbg("pyzor: [%s] finished: %s", $pid, exit_status_str($?, $errno));
- } else {
- info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
- }
-
- });
-
- if (defined(fileno(*PYZOR))) { # still open
- if ($pid) {
- if (kill('TERM', $pid)) {
- dbg("pyzor: killed stale helper [$pid]");
- } else {
- dbg("pyzor: killing helper application [$pid] failed: $!");
- }
- }
- my $errno = 0;
- close PYZOR or $errno = $!;
- proc_status_ok($?, $errno)
- or info("pyzor: [%s] error: %s", $pid, exit_status_str($?, $errno));
- }
-
- $pms->leave_helper_run_mode();
-
- if ($timer->timed_out()) {
- dbg("pyzor: check timed out after $timeout seconds");
- return ();
- } elsif ($err) {
- chomp $err;
- info("pyzor: check failed: $err");
- return ();
- }
-
- return @resp;
-}
-
-sub check_tick {
- my ($self, $opts) = @_;
- $self->_check_forked_result($opts->{permsgstatus}, 0);
-}
-
-sub check_cleanup {
- my ($self, $opts) = @_;
- $self->_check_forked_result($opts->{permsgstatus}, 1);
-}
-
-sub _check_forked_result {
- my ($self, $pms, $finish) = @_;
-
- return 0 if !$pms->{pyzor_backchannel};
- return 0 if !$pms->{pyzor_pid};
+ my ( $self, $permsgstatus, $fulltext ) = @_;
+ my $conf = $self->{main}->{conf};
+ my $timeout = $conf->{pyzor_timeout};
+
+ my $client = ( $self->{'_pyzor_client'} ||=
Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
+ my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $fulltext );
+
+ local $@;
+ my $ref = eval { $client->check($digest); };
+ dbg("pyzor: got response: $client->{'_server_host'}");
+ # $client reply must be a hash
+ return 0 if (not (ref $ref eq ref {}));
+ if ($@) {
+ my $err = $@;
- my $timer = $self->{main}->time_method("check_pyzor");
+ $err = eval { $err->get_message() } || $err;
- $pms->{pyzor_abort} = $pms->{deadline_exceeded} || $pms->{shortcircuited};
-
- my $kid_pid = $pms->{pyzor_pid};
- # if $finish, force waiting for the child
- my $pid = waitpid($kid_pid, $finish && !$pms->{pyzor_abort} ? 0 : WNOHANG);
- if ($pid == 0) {
- #dbg("pyzor: child process $kid_pid not finished yet, trying later");
- if ($pms->{pyzor_abort}) {
- dbg("pyzor: bailing out due to deadline/shortcircuit");
- kill('TERM', $kid_pid);
- if (waitpid($kid_pid, WNOHANG) == 0) {
- sleep(1);
- if (waitpid($kid_pid, WNOHANG) == 0) {
- dbg("pyzor: child process $kid_pid still alive, KILL");
- kill('KILL', $kid_pid);
- waitpid($kid_pid, 0);
+ warn("pyzor: check failed: $err\n");
+ return 0;
+ } elsif ( defined $ref->{'Code'} and $ref->{'Code'} ne 200 ) {
+ if(defined $ref->{'Code'} and defined $ref->{'Diag'}) {
+ dbg("pyzor: check failed with invalid code: $ref->{'Code'}:
$ref->{'Diag'}");
+ } else {
+ dbg("pyzor: check failed with undefined code");
}
- }
- delete $pms->{pyzor_pid};
- delete $pms->{pyzor_backchannel};
+ return 0;
}
- return 0;
- } elsif ($pid == -1) {
- # child does not exist?
- dbg("pyzor: child process $kid_pid already handled?");
- delete $pms->{pyzor_backchannel};
- return 0;
- }
- $pms->rule_ready($pms->{pyzor_rulename}); # mark rule ready for metas
+ my $pyzor_count = untaint_var($ref->{'Count'}) + 0;
+ my $pyzor_whitelisted = untaint_var($ref->{'WL-Count'}) + 0;
+ my $count_min = $conf->{pyzor_count_min};
+ my $wl_min = $conf->{pyzor_whitelist_min};
- dbg("pyzor: child process $kid_pid finished, reading results");
+ my $wl_limit = $pyzor_whitelisted >= $wl_min ?
+ $pyzor_count * $conf->{pyzor_whitelist_factor} : 0;
- my $backmsg;
- my $ret = sysread($pms->{pyzor_backchannel}->{latest_kid_fh}, $backmsg,
PIPE_BUF);
- if (!defined $ret || $ret == 0) {
- dbg("pyzor: could not read result from child: ".($ret == 0 ? 0 : $!));
- delete $pms->{pyzor_backchannel};
- return 0;
- }
-
- delete $pms->{pyzor_backchannel};
+ $permsgstatus->set_tag('PYZOR', "Reported $pyzor_count times, whitelisted
$pyzor_whitelisted times.");
- my $results;
- eval {
- $results = Storable::thaw($backmsg);
- };
- if ($@) {
- dbg("pyzor: child return value thaw failed: $@");
- return;
- }
-
- $self->_check_result($pms, $results);
-}
+ dbg("pyzor: result: COUNT=$pyzor_count/$count_min
WHITELIST=$pyzor_whitelisted/$wl_min/%.1f",
+ $wl_limit);
-sub _check_result {
- my ($self, $pms, $results) = @_;
-
- if (!@$results) {
- dbg("pyzor: no response from server");
- return 0;
- }
-
- my $count = 0;
- my $count_wl = 0;
- foreach my $res (@$results) {
- chomp($res);
- if ($res =~ /^Traceback/) {
- info("pyzor: internal error, python traceback seen in response: $res");
+ # Empty body etc results in same hash, we should skip very large numbers..
+ if ($pyzor_count >= 1000000 || $pyzor_whitelisted >= 10000) {
+ dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl
1000000/10000");
return 0;
}
- dbg("pyzor: got response: $res");
- # this regexp is intended to be a little bit forgiving
- if ($res =~ /^\S+\t.*?\t(\d+)\t(\d+)\s*$/) {
- # until pyzor servers can sync their DBs,
- # sum counts obtained from all servers
- $count += untaint_var($1)+0; # crazy but needs untainting
- $count_wl += untaint_var($2)+0;
- } else {
- # warn on failures to parse
- info("pyzor: failure to parse response \"$res\"");
- }
- }
-
- my $conf = $self->{main}->{conf};
-
- my $count_min = $conf->{pyzor_count_min};
- my $wl_min = $conf->{pyzor_whitelist_min};
- my $wl_limit = $count_wl >= $wl_min ?
- $count * $conf->{pyzor_whitelist_factor} : 0;
-
- dbg("pyzor: result: COUNT=$count/$count_min
WHITELIST=$count_wl/$wl_min/%.1f",
- $wl_limit);
- $pms->set_tag('PYZOR', "Reported $count times, whitelisted $count_wl
times.");
-
- # Empty body etc results in same hash, we should skip very large numbers..
- if ($count >= 1000000 || $count_wl >= 10000) {
- dbg("pyzor: result exceeded hardcoded limits, ignoring: count/wl
1000000/10000");
- return 0;
- }
-
- # Whitelisted?
- if ($wl_limit && $count_wl >= $wl_limit) {
- dbg("pyzor: message whitelisted");
- return 0;
- }
+ # Whitelisted?
+ if ($wl_limit && $pyzor_whitelisted >= $wl_limit) {
+ dbg("pyzor: message whitelisted");
+ return 0;
+ }
- if ($count >= $count_min) {
- if ($conf->{pyzor_fork}) {
- # forked needs to run got_hit()
- $pms->got_hit($pms->{pyzor_rulename}, "", ruletype => 'eval');
+ if ( $pyzor_count >= $count_min ) {
+ return 1;
}
- return 1;
- }
- return 0;
+ return 0;
}
sub plugin_report {
my ($self, $options) = @_;
- return if !$self->{pyzor_available};
- return if !$self->{main}->{conf}->{use_pyzor};
- return if $options->{report}->{options}->{dont_report_to_pyzor};
- return if !$self->is_pyzor_available();
-
- # use temporary file: open2() is unreliable due to buffering under spamd
- my $tmpf = $options->{report}->create_fulltext_tmpfile($options->{text});
- if ($self->pyzor_report($options, $tmpf)) {
- $options->{report}->{report_available} = 1;
- info("reporter: spam reported to Pyzor");
- $options->{report}->{report_return} = 1;
- }
- else {
- info("reporter: could not report spam to Pyzor");
- }
- $options->{report}->delete_fulltext_tmpfile($tmpf);
+ return unless $self->{pyzor_available};
+ return unless $self->{main}->{conf}->{use_pyzor};
- return 1;
+ if (!$options->{report}->{options}->{dont_report_to_pyzor} &&
$self->is_pyzor_available())
+ {
+ if ($self->pyzor_report($options)) {
+ $options->{report}->{report_available} = 1;
+ info("reporter: spam reported to Pyzor");
+ $options->{report}->{report_return} = 1;
+ }
+ else {
+ info("reporter: could not report spam to Pyzor");
+ }
+ }
}
sub pyzor_report {
- my ($self, $options, $tmpf) = @_;
-
- # note: not really tainted, this came from system configuration file
- my $path = untaint_file_path($options->{report}->{conf}->{pyzor_path});
- my $opts = untaint_var($options->{report}->{conf}->{pyzor_options}) || '';
+ my ( $self, $options ) = @_;
- my $timeout = $self->{main}->{conf}->{pyzor_timeout};
+ my $timeout = $self->{main}->{conf}->{pyzor_timeout};
- $options->{report}->enter_helper_run_mode();
+ my $client = ( $self->{'_pyzor_client'} ||=
Mail::SpamAssassin::Pyzor::Client->new( 'timeout' => $timeout ) );
- my $timer = Mail::SpamAssassin::Timeout->new({ secs => $timeout });
- my $err = $timer->run_and_catch(sub {
+ my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $options->{'text'} );
- local $SIG{PIPE} = sub { die "__brokenpipe__ignore__\n" };
-
- dbg("pyzor: opening pipe: " . join(' ', $path, $opts, "report", "<
$tmpf"));
-
- my $pid = Mail::SpamAssassin::Util::helper_app_pipe_open(*PYZOR,
- $tmpf, 1, $path, split(' ', $opts), "report");
- $pid or die "$!\n";
-
- my($inbuf,$nread,$nread_all); $nread_all = 0;
- # response is ignored, just check its existence
- while ( $nread=read(PYZOR,$inbuf,8192) ) { $nread_all += $nread }
- defined $nread or die "error reading from pipe: $!";
-
- dbg("pyzor: empty response") if $nread_all < 1;
-
- my $errno = 0; close PYZOR or $errno = $!;
- # closing a pipe also waits for the process executing on the pipe to
- # complete, no need to explicitly call waitpid
- # my $child_stat = waitpid($pid,0) > 0 ? $? : undef;
- if (proc_status_ok($?,$errno, 0)) {
- dbg("pyzor: [%s] reporter finished successfully", $pid);
- } else {
- info("pyzor: [%s] reporter error: %s", $pid, exit_status_str($?,$errno));
+ local $@;
+ my $ref = eval { $client->report($digest); };
+ if ($@) {
+ warn("pyzor: report failed: $@");
+ return 0;
}
-
- });
-
- $options->{report}->leave_helper_run_mode();
-
- if ($timer->timed_out()) {
- dbg("reporter: pyzor report timed out after $timeout seconds");
- return 0;
- }
-
- if ($err) {
- chomp $err;
- if ($err eq '__brokenpipe__ignore__') {
- dbg("reporter: pyzor report failed: broken pipe");
- } else {
- warn("reporter: pyzor report failed: $err\n");
+ elsif ( $ref->{'Code'} ne 200 ) {
+ dbg("pyzor: report failed with invalid code: $ref->{'Code'}:
$ref->{'Diag'}");
+ return 0;
}
- return 0;
- }
- return 1;
+ return 1;
}
-# Version features
-sub has_fork { 1 }
-
1;
-
-=back
-
-=cut
diff --git a/lib/Mail/SpamAssassin/Pyzor.pm b/lib/Mail/SpamAssassin/Pyzor.pm
new file mode 100644
index 0000000..8ac27f4
--- /dev/null
+++ b/lib/Mail/SpamAssassin/Pyzor.pm
@@ -0,0 +1,56 @@
+package Mail::SpamAssassin::Pyzor;
+
+# Copyright 2018 cPanel, LLC.
+# All rights reserved.
+# http://cpanel.net
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+#
+
+use strict;
+use warnings;
+
+our $VERSION = '0.06_01';
+
+=encoding utf-8
+
+=head1 NAME
+
+Mail::SpamAssassin::Pyzor - Pyzor spam filtering in Perl
+
+=head1 DESCRIPTION
+
+This distribution contains Perl implementations of parts of
+L<Pyzor|http://pyzor.org>, a tool for use in spam email filtering.
+It is intended for use with L<Mail::SpamAssassin> but may be useful
+in other contexts.
+
+See the following modules for information on specific tools that
+the distribution includes:
+
+=over
+
+=item * L<Mail::SpamAssassin::Pyzor::Client>
+
+=item * L<Mail::SpamAssassin::Pyzor::Digest>
+
+=back
+
+=cut
+
+1;
diff --git a/lib/Mail/SpamAssassin/Pyzor/Client.pm
b/lib/Mail/SpamAssassin/Pyzor/Client.pm
new file mode 100644
index 0000000..ccff868
--- /dev/null
+++ b/lib/Mail/SpamAssassin/Pyzor/Client.pm
@@ -0,0 +1,415 @@
+package Mail::SpamAssassin::Pyzor::Client;
+
+# Copyright 2018 cPanel, LLC.
+# All rights reserved.
+# http://cpanel.net
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+#
+
+use strict;
+use warnings;
+
+=encoding utf-8
+
+=head1 NAME
+
+Mail::SpamAssassin::Pyzor::Client - Pyzor client logic
+
+=head1 SYNOPSIS
+
+ use Mail::SpamAssassin::Pyzor::Client ();
+ use Mail::SpamAssassin::Pyzor::Digest ();
+
+ my $client = Mail::SpamAssassin::Pyzor::Client->new();
+
+ my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $msg );
+
+ my $check_ref = $client->check($digest);
+ die $check_ref->{'Diag'} if $check_ref->{'Code'} ne '200';
+
+ my $report_ref = $client->report($digest);
+ die $report_ref->{'Diag'} if $report_ref->{'Code'} ne '200';
+
+=head1 DESCRIPTION
+
+A bare-bones L<Pyzor|http://pyzor.org> client that currently only
+implements the functionality needed for L<Mail::SpamAssassin>.
+
+=head1 PROTOCOL DETAILS
+
+The Pyzor protocol is not a published standard, and there appears to be
+no meaningful public documentation. What follows is enough information,
+largely gleaned through forum posts and reverse engineering, to facilitate
+effective use of this module:
+
+Pyzor is an RPC-oriented, message-based protocol. Each message
+is a simple dictionary of 7-bit ASCII keys and values. Server responses
+always include at least the following:
+
+=over
+
+=item * C<Code> - Similar to HTTP status codes; anything besides C<200>
+is an error.
+
+=item * C<Diag> - Similar to HTTP status reasons: a text description
+of the status.
+
+=back
+
+(NB: There are additional standard response headers that are useful only for
+the protocol itself and thus are not part of this module's returns.)
+
+=head2 Reliability
+
+Pyzor uses UDP rather than TCP, so no message is guaranteed to reach its
+destination. A transmission failure can happen in either the request or
+the response; in either case, the request times out. On a timeout this
+client emits a warning and the request method returns 0, not a hashref.
+
+=cut
+
+#----------------------------------------------------------------------
+
+our $VERSION = '0.04';
+
+our $DEFAULT_SERVER_HOST = 'public.pyzor.org';
+our $DEFAULT_SERVER_PORT = 24441;
+our $DEFAULT_USERNAME = 'anonymous';
+our $DEFAULT_PASSWORD = '';
+our $DEFAULT_OP_SPEC = '20,3,60,3';
+our $PYZOR_PROTOCOL_VERSION = 2.1;
+our $DEFAULT_TIMEOUT = 3.5;
+our $READ_SIZE = 8192;
+
+use IO::Socket::INET ();
+use Digest::SHA qw(sha1 sha1_hex);
+
+my @hash_order = ( 'Op', 'Op-Digest', 'Op-Spec', 'Thread', 'PV', 'User',
'Time', 'Sig' );
+
+#----------------------------------------------------------------------
+
+=head1 CONSTRUCTOR
+
+=head2 new(%OPTS)
+
+Create a new pyzor client.
+
+=over 2
+
+=item Input
+
+%OPTS are (all optional):
+
+=over 3
+
+=item * C<server_host> - The pyzor server host to connect to (default is
+C<public.pyzor.org>)
+
+=item * C<server_port> - The pyzor server port to connect to (default is
+24441)
+
+=item * C<username> - The username to present to the pyzor server (default
+is C<anonymous>)
+
+=item * C<password> - The password to present to the pyzor server (default
+is empty)
+
+=item * C<timeout> - The maximum time, in seconds, to wait for a response
+from the pyzor server (default is 3.5)
+
+=back
+
+=item Output
+
+=over 3
+
+Returns a L<Mail::SpamAssassin::Pyzor::Client> object.
+
+=back
+
+=back
+
+=cut
+
+sub new {
+ my ( $class, %OPTS ) = @_;
+
+ return bless {
+ '_server_host' => $OPTS{'server_host'} || $DEFAULT_SERVER_HOST,
+ '_server_port' => $OPTS{'server_port'} || $DEFAULT_SERVER_PORT,
+ '_username' => $OPTS{'username'} || $DEFAULT_USERNAME,
+ '_password' => $OPTS{'password'} || $DEFAULT_PASSWORD,
+ '_op_spec' => $DEFAULT_OP_SPEC,
+ '_timeout' => $OPTS{'timeout'} || $DEFAULT_TIMEOUT,
+ }, $class;
+}
+
+#----------------------------------------------------------------------
+
+=head1 REQUEST METHODS
+
+=head2 report($digest)
+
+Report the digest of a spam message to the pyzor server. This function dies
+if the socket cannot be created; on a timeout it warns and returns 0.
+
+=over 2
+
+=item Input
+
+=over 3
+
+=item $digest C<SCALAR>
+
+The message digest to report, as given by
+C<Mail::SpamAssassin::Pyzor::Digest::get()>.
+
+=back
+
+=item Output
+
+=over 3
+
+=item C<HASHREF>
+
+Returns a hashref of the standard attributes noted above.
+
+=back
+
+=back
+
+=cut
+
+sub report {
+ my ( $self, $digest ) = @_;
+
+ my $msg_ref = $self->_get_base_msg( 'report', $digest );
+
+ $msg_ref->{'Op-Spec'} = $self->{'_op_spec'};
+
+ return $self->_send_receive_msg($msg_ref);
+}
+
+=head2 check($digest)
+
+Check the digest of a message to see if
+the pyzor server has a report for it. This function dies if the
+socket cannot be created; on a timeout it warns and returns 0.
+
+=over 2
+
+=item Input
+
+=over 3
+
+=item $digest C<SCALAR>
+
+The message digest to check, as given by
+C<Mail::SpamAssassin::Pyzor::Digest::get()>.
+
+=back
+
+=item Output
+
+=over 3
+
+=item C<HASHREF>
+
+Returns a hashref of the standard attributes noted above
+as well as the following:
+
+=over
+
+=item * C<Count> - The number of reports the server has received
+for the given digest.
+
+=item * C<WL-Count> - The number of whitelist requests the server has received
+for the given digest.
+
+=back
+
+=back
+
+=back
+
+=cut
+
+sub check {
+ my ( $self, $digest ) = @_;
+
+ return $self->_send_receive_msg( $self->_get_base_msg( 'check', $digest )
);
+}
+
+# ----------------------------------------
+
+sub _send_receive_msg {
+ my ( $self, $msg_ref ) = @_;
+
+ my $thread_id = $msg_ref->{'Thread'} or warn 'No thread ID?';
+
+ $self->_sign_msg($msg_ref);
+
+ return $self->_do_send_receive(
+ $self->_generate_packet_from_message($msg_ref) . "\n\n",
+ $thread_id,
+ );
+}
+
+sub _get_base_msg {
+ my ( $self, $op, $digest ) = @_;
+
+ die "Implementor error: op is required" if !$op;
+ die "error: digest is required" if !$digest;
+
+ return {
+ 'User' => $self->{'_username'},
+ 'PV' => $PYZOR_PROTOCOL_VERSION,
+ 'Time' => time(),
+ 'Op' => $op,
+ 'Op-Digest' => $digest,
+ 'Thread' => $self->_generate_thread_id()
+ };
+}
+
+sub _do_send_receive {
+ my ( $self, $packet, $thread_id ) = @_;
+
+ my $sock = $self->_get_connection_or_die();
+
+ $self->_send_packet( $sock, $packet );
+ my $response = $self->_receive_packet( $sock, $thread_id );
+
+ return 0 if not defined $response;
+
+ my $resp_hr = { map { ( split(m{: }) )[ 0, 1 ] } split( m{\n}, $response )
};
+
+ delete $resp_hr->{'Thread'};
+
+ my $response_pv = delete $resp_hr->{'PV'};
+
+ if ( $PYZOR_PROTOCOL_VERSION ne $response_pv ) {
+ warn "Unexpected protocol version ($response_pv) in Pyzor response!";
+ }
+
+ return $resp_hr;
+}
+
+sub _receive_packet {
+ my ( $self, $sock, $thread_id ) = @_;
+
+ my $timeout = $self->{'_timeout'} * 1000;
+
+ my $end_time = time + $self->{'_timeout'};
+
+ $sock->blocking(0);
+ my $response = '';
+ my $rout = '';
+ my $rin = '';
+ vec( $rin, fileno($sock), 1 ) = 1;
+
+ while (1) {
+ my $time_left = $end_time - time;
+
+ if ( $time_left <= 0 ) {
+ warn("Did not receive a response from the pyzor server
$self->{'_server_host'}:$self->{'_server_port'} for $self->{'_timeout'} seconds!");
+ return;
+ }
+
+ my $bytes = sysread( $sock, $response, $READ_SIZE, length $response );
+ if ( !defined($bytes) && !$!{'EAGAIN'} && !$!{'EWOULDBLOCK'} ) {
+ warn "read from socket: $!";
+ }
+
+ if ( index( $response, "\n\n" ) > -1 ) {
+
+ # Reject the response unless its thread ID matches what we sent.
+ # This prevents confusion among concurrent Pyzor requests.
+ if ( index( $response, "\nThread: $thread_id\n" ) != -1 ) {
+ last;
+ }
+ else {
+ $response = '';
+ }
+ }
+
+ my $found = select( $rout = $rin, undef, undef, $time_left );
+ warn "select(): $!" if $found == -1;
+ }
+
+ return $response;
+}
+
+sub _send_packet {
+ my ( $self, $sock, $packet ) = @_;
+
+ $sock->blocking(1);
+ syswrite( $sock, $packet ) or warn "write to socket: $!";
+
+ return;
+}
+
+sub _get_connection_or_die {
+ my ($self) = @_;
+
+ # clear the socket if the PID changes
+ if ( defined $self->{'_sock_pid'} && $self->{'_sock_pid'} != $$ ) {
+ undef $self->{'_sock_pid'};
+ undef $self->{'_sock'};
+ }
+
+ $self->{'_sock_pid'} ||= $$;
+ $self->{'_sock'} ||= IO::Socket::INET->new(
+ 'PeerHost' => $self->{'_server_host'},
+ 'PeerPort' => $self->{'_server_port'},
+ 'Proto' => 'udp'
+ ) or die "Cannot connect to $self->{'_server_host'}:$self->{'_server_port'}: $@
$!";
+
+ return $self->{'_sock'};
+}
+
+sub _sign_msg {
+ my ( $self, $msg_ref ) = @_;
+
+ $msg_ref->{'Sig'} = lc Digest::SHA::sha1_hex(
+ Digest::SHA::sha1( $self->_generate_packet_from_message($msg_ref) )
+ );
+
+ return 1;
+}
+
+sub _generate_packet_from_message {
+ my ( $self, $msg_ref ) = @_;
+
+ return join( "\n", map { "$_: $msg_ref->{$_}" } grep { length
$msg_ref->{$_} } @hash_order );
+}
+
+sub _generate_thread_id {
+ my $RAND_MAX = 2**16;
+ my $val = 0;
+ $val = int rand($RAND_MAX) while $val < 1024;
+ return $val;
+}
+
+sub _get_user_pass_hash_key {
+ my ($self) = @_;
+
+ return lc Digest::SHA::sha1_hex( $self->{'_username'} . ':' .
$self->{'_password'} );
+}
+
+1;
diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest.pm
b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
new file mode 100644
index 0000000..0e8a5ae
--- /dev/null
+++ b/lib/Mail/SpamAssassin/Pyzor/Digest.pm
@@ -0,0 +1,103 @@
+package Mail::SpamAssassin::Pyzor::Digest;
+
+# Copyright 2018 cPanel, LLC.
+# All rights reserved.
+# http://cpanel.net
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+#
+
+use strict;
+use warnings;
+
+=encoding utf-8
+
+=head1 NAME
+
+Mail::SpamAssassin::Pyzor::Digest
+
+=head1 SYNOPSIS
+
+ my $digest = Mail::SpamAssassin::Pyzor::Digest::get( $mime_text );
+
+=head1 DESCRIPTION
+
+A reimplementation of L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>.
+
+=cut
+
+#----------------------------------------------------------------------
+
+use Email::MIME ();
+
+use Mail::SpamAssassin::Pyzor::Digest::Pieces ();
+use Digest::SHA qw(sha1_hex);
+
+our $VERSION = '0.03';
+
+#----------------------------------------------------------------------
+
+=head1 FUNCTIONS
+
+=head2 $hex = get( $MSG )
+
+This takes an email message in raw MIME text format (i.e., as saved in the
+standard mbox format) and returns the message’s Pyzor digest in lower-case
+hexadecimal.
+
+The output from this function should normally be identical to that of
+the C<pyzor> script’s C<digest> command. It is suitable for use in
+L<Mail::SpamAssassin::Pyzor::Client>’s request methods.
+
+=cut
+
+sub get {    # returns the hex SHA-1 of the pre-digest buffer built by _get_predigest()
+ my ($text) = @_;    # handed on to _get_predigest() unchanged
+ return Digest::SHA::sha1_hex( ${ _get_predigest( $text ) } );    # _get_predigest() returns a scalar ref, hence the ${ ... }
+}
+
+# NB: This is called from the test. Takes the raw message as a string or a reference to one; returns a ref to the octet buffer to hash.
+sub _get_predigest {    ## no critic qw(RequireArgUnpacking)
+    my $msg_text_sr = ref $_[0] ? $_[0] : \$_[0];    # a plain string must not be dereferenced under "strict refs"
+
+    my $parsed = Email::MIME->new($$msg_text_sr);
+
+    my @lines;
+
+    my $payloads_ar = Mail::SpamAssassin::Pyzor::Digest::Pieces::digest_payloads($parsed);
+
+    for my $payload (@$payloads_ar) {
+        my @p_lines = Mail::SpamAssassin::Pyzor::Digest::Pieces::splitlines($payload);
+        for my $line (@p_lines) {
+            Mail::SpamAssassin::Pyzor::Digest::Pieces::normalize($line);    # edits $line in place
+
+            next if !Mail::SpamAssassin::Pyzor::Digest::Pieces::should_handle_line($line);
+
+            # Make sure we have an octet string.
+            utf8::encode($line) if utf8::is_utf8($line);
+
+            push @lines, $line;
+        }
+    }
+
+    my $digest_sr = Mail::SpamAssassin::Pyzor::Digest::Pieces::assemble_lines( \@lines );
+
+    return $digest_sr;
+}
+
+1;
diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
new file mode 100644
index 0000000..522accd
--- /dev/null
+++ b/lib/Mail/SpamAssassin/Pyzor/Digest/Pieces.pm
@@ -0,0 +1,301 @@
+package Mail::SpamAssassin::Pyzor::Digest::Pieces;
+
+# Copyright 2018 cPanel, LLC.
+# All rights reserved.
+# http://cpanel.net
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+#
+
+use strict;
+use warnings;
+
+=encoding utf-8
+
+=head1 NAME
+
+Mail::SpamAssassin::Pyzor::Digest::Pieces
+
+=head1 DESCRIPTION
+
+This module houses backend logic for L<Mail::SpamAssassin::Pyzor::Digest>.
+
+It reimplements logic found in pyzor’s F<digest.py> module
+(L<https://github.com/SpamExperts/pyzor/blob/master/pyzor/digest.py>).
+
+=cut
+
+#----------------------------------------------------------------------
+
+use Email::MIME::ContentType ();
+use Encode ();
+
+our $VERSION = '0.03';
+
+# each tuple is [ offset, length ]
+use constant _HASH_SPEC => ( [ 20, 3 ], [ 60, 3 ] );
+
+use constant {
+ _MIN_LINE_LENGTH => 8,
+
+ _ATOMIC_NUM_LINES => 4,
+};
+
+#----------------------------------------------------------------------
+
+=head1 FUNCTIONS
+
+=head2 $strings_ar = digest_payloads( $EMAIL_MIME )
+
+This imitates the corresponding object method in F<digest.py>.
+It returns a reference to an array of strings. Each string can be either
+a byte string or a character string (e.g., UTF-8 decoded).
+
+NB: RFC 2822 stipulates that message bodies should use CRLF
+line breaks, not plain LF (nor plain CR). L<Email::MIME::Encodings>
+will thus convert any plain CRs in a quoted-printable message
+body into CRLF. Python, though, doesn’t do this, so the output of
+our implementation of C<digest_payloads()> diverges from that of the Python
+original. It doesn’t ultimately make a difference since the line-ending
+whitespace gets trimmed regardless, but it’s necessary to factor in when
+comparing the output of our implementation with the Python output.
+
+=cut
+
+sub digest_payloads {
+ my ($parsed) = @_;
+
+ my @subparts = $parsed->subparts();
+
+ my @payloads;
+
+ if (@subparts) {    # has subparts: recurse into each and flatten the results in order
+ @payloads = map { @{ digest_payloads($_) } } $parsed->subparts();
+ }
+ else {    # no subparts: this part contributes exactly one payload
+ my ( $main_type, $subtype, $encoding, $encode_check ) =
parse_content_type( $parsed->content_type() );
+
+ my $payload;
+
+ if ( $main_type eq 'text' ) {
+
+ # Decode transfer encoding, but leave us as a byte string.
+ # Note that this is where Email::MIME converts plain LF to CRLF.
+ $payload = $parsed->body();
+
+ # This does the actual character decoding (i.e., “charset”).
+ $payload = Encode::decode( $encoding, $payload, $encode_check );
+
+ if ( $subtype eq 'html' ) {
+ require Mail::SpamAssassin::Pyzor::Digest::StripHtml;    # loaded lazily, only when an HTML part is seen
+ $payload =
Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($payload);
+ }
+ }
+ else {
+
+ # This does no decoding, even of, e.g., quoted-printable or base64.
+ $payload = $parsed->body_raw();
+ }
+
+ push @payloads, $payload;
+ }
+
+ return \@payloads;
+}
+
+#----------------------------------------------------------------------
+
+=head2 normalize( $STRING )
+
+This imitates the corresponding object method in F<digest.py>.
+It modifies C<$STRING> in-place.
+
+As with the original implementation, if C<$STRING> contains (decoded)
+Unicode characters, those characters will be parsed accordingly. So:
+
+ $str = "123\xc2\xa0"; # [ c2 a0 ] == \u00a0, non-breaking space
+
+ normalize($str);
+
+The above will leave C<$str> alone, but this:
+
+ utf8::decode($str);
+
+ normalize($str);
+
+… will trim off the last two bytes from C<$str>.
+
+=cut
+
+sub normalize { ## no critic qw( Subroutines::RequireArgUnpacking )
+
+ # NULs are bad, mm-kay? ($_[0] aliases the caller's variable, so every edit below happens in place.)
+ $_[0] =~ tr<\0><>d;
+
+ # NB: Python’s \s without re.UNICODE is the same as Perl’s \s
+ # with the /a modifier.
+ #
+ # https://docs.python.org/2/library/re.html
+ # https://perldoc.perl.org/perlrecharclass.html#Backslash-sequences
+
+ # Python: re.compile(r'\S{10,}')
+ $_[0] =~ s<\S{10,}><>ag;
+
+ # Python: re.compile(r'\S+@\S+')
+ $_[0] =~ s<\S+ @ \S+><>agx;
+
+ # Python: re.compile(r'[a-z]+:\S+', re.IGNORECASE)
+ $_[0] =~ s<[a-zA-Z]+ : \S+><>agx;
+
+ # (from digest.py …)
+ # Make sure we do the whitespace last because some of the previous
+ # patterns rely on whitespace.
+ $_[0] =~ tr< \x09-\x0d><>d;
+
+ # This is fun. digest.py’s normalize() does a non-UNICODE whitespace
+ # strip, then calls strip() on the string, which *will* strip Unicode
+ # whitespace from the ends.
+ $_[0] =~ s<\A\s+><>;
+ $_[0] =~ s<\s+\z><>;
+
+ return;
+}
+
+#----------------------------------------------------------------------
+
+=head2 $yn = should_handle_line( $STRING )
+
+This imitates the corresponding object method in F<digest.py>.
+It returns a boolean.
+
+=cut
+
+sub should_handle_line {    # true only for a true-valued string of at least _MIN_LINE_LENGTH (8) characters
+ return $_[0] && length( $_[0] ) >= _MIN_LINE_LENGTH();    # when $_[0] is false, that false value itself is returned
+}
+
+#----------------------------------------------------------------------
+
+=head2 $sr = assemble_lines( \@LINES )
+
+This assembles a string buffer out of @LINES. The string is the buffer
+of octets that will be hashed to produce the message digest.
+
+Each member of @LINES is expected to be an B<octet string>, not a
+character string.
+
+=cut
+
+sub assemble_lines {
+ my ($lines_ar) = @_;
+
+ if ( @$lines_ar <= _ATOMIC_NUM_LINES() ) {
+
+ # cf. handle_atomic() in digest.py: few enough lines, so concatenate all of them
+ return \join( q<>, @$lines_ar );
+ }
+
+ #----------------------------------------------------------------------
+ # cf. handle_pieced() in digest.py: otherwise sample a few lines at each _HASH_SPEC offset
+
+ my $str = q<>;
+
+ for my $ofs_len ( _HASH_SPEC() ) {
+ my ( $offset, $length ) = @$ofs_len;    # $offset is a percentage of the line count; $length is how many lines to take
+
+ for my $i ( 0 .. ( $length - 1 ) ) {
+ my $idx = int( $offset * @$lines_ar / 100 ) + $i;
+
+ next if !defined $lines_ar->[$idx];    # skip missing/undefined slots (e.g. an index past the end)
+
+ $str .= $lines_ar->[$idx];
+ }
+ }
+
+ return \$str;
+}
+
+#----------------------------------------------------------------------
+
+=head2 ($main, $sub, $encoding, $checkval) = parse_content_type( $CONTENT_TYPE )
+
+=cut
+
+use constant _QUOTED_PRINTABLE_NAMES => (    # charset values that get the strict FB_CROAK check below
+ "quopri-codec",
+ "quopri",
+ "quoted-printable",
+ "quotedprintable",
+);
+
+# Make Encode::decode() ignore anything that doesn’t fit the
+# given encoding.
+use constant _encode_check_ignore => q<>;
+
+sub parse_content_type {
+ my ($content_type) = @_;
+
+ $Email::MIME::ContentType::STRICT_PARAMS = 0;    # NOTE: plain assignment to a package variable (not local), so it stays 0 after this call
+ my $ct_parse = Email::MIME::ContentType::parse_content_type(
+ $content_type,
+ );
+
+ my $main = $ct_parse->{'type'} || q<>;
+ my $sub = $ct_parse->{'subtype'} || q<>;
+
+ my $encoding = $ct_parse->{'attributes'}{'charset'};
+
+ my $checkval;
+
+ if ($encoding) {
+
+ # Lower-case everything, convert underscore to dash, and remove NUL.
+ $encoding =~ tr<A-Z_\0><a-z->d;
+
+ # Apparently pyzor accommodates messages that put the transfer
+ # encoding in the Content-Type.
+ if ( grep { $_ eq $encoding } _QUOTED_PRINTABLE_NAMES() ) {
+ $checkval = Encode::FB_CROAK();
+ }
+ }
+ else {
+ $encoding = 'ascii';    # charset missing or false: fall back to ASCII
+ }
+
+ # Match Python .decode()’s 'ignore' behavior
+ $checkval ||= \&_encode_check_ignore;
+
+ return ( $main, $sub, $encoding, $checkval );
+}
+
+#----------------------------------------------------------------------
+
+=head2 @lines = splitlines( $TEXT )
+
+Imitates C<str.splitlines()>. (cf. C<pydoc str>)
+
+Returns a plain list in list context. Returns the number of
+items to be returned in scalar context.
+
+=cut
+
+sub splitlines {    # breaks $_[0] on CRLF, a lone CR, or a lone LF
+ return split m<\r\n?|\n>, $_[0];    # split() drops trailing empty fields
+}
+
+1;
diff --git a/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
new file mode 100644
index 0000000..2617b4a
--- /dev/null
+++ b/lib/Mail/SpamAssassin/Pyzor/Digest/StripHtml.pm
@@ -0,0 +1,177 @@
+package Mail::SpamAssassin::Pyzor::Digest::StripHtml;
+
+# Copyright 2018 cPanel, LLC.
+# All rights reserved.
+# http://cpanel.net
+#
+# <@LICENSE>
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to you under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at:
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# </@LICENSE>
+#
+
+use strict;
+use warnings;
+
+=encoding utf-8
+
+=head1 NAME
+
+Mail::SpamAssassin::Pyzor::Digest::StripHtml
+
+=head1 SYNOPSIS
+
+ my $stripped = Mail::SpamAssassin::Pyzor::Digest::StripHtml::strip($html);
+
+=head1 DESCRIPTION
+
+This module attempts to duplicate pyzor’s HTML-stripping logic.
+
+=head1 ACCURACY
+
+This library cannot achieve 100%, bug-for-bug parity with pyzor
+because to do so would require duplicating Python’s own HTML parsing
+library. Since that library’s output has changed over time, and those
+changes in turn affect pyzor, it’s literally impossible to arrive at
+a single, fully-compatible reimplementation.
+
+That said, all known divergences between pyzor and this library involve
+invalid HTML as input.
+
+Please open bug reports for any divergences you identify, particularly
+if the input is valid HTML.
+
+=cut
+
+#----------------------------------------------------------------------
+
+use HTML::Parser ();
+
+our $VERSION = '0.03';
+
+#----------------------------------------------------------------------
+
+=head1 FUNCTIONS
+
+=head2 $stripped = strip( $HTML )
+
+Give it some HTML, and it’ll give back the stripped text.
+
+In B<general>, the stripping consists of removing tags as well as
+C<E<lt>scriptE<gt>> and C<E<lt>styleE<gt>> elements; however, it also
+removes HTML entities.
+
+This tries very hard to duplicate pyzor’s behavior with invalid HTML.
+
+=cut
+
+sub strip {
+ my ($html) = @_;
+
+ $html =~ s<\A\s+><>;
+ $html =~ s<\s+\z><>;
+
+ my $p = HTML::Parser->new( api_version => 3 );
+
+ my @pieces;
+
+ my $accumulate = 1;    # text chunks are kept only while this is true
+
+ $p->handler(
+ start => sub {
+ my ($tagname) = @_;
+
+ $accumulate = 0 if $tagname eq 'script';
+ $accumulate = 0 if $tagname eq 'style';
+
+ return;
+ },
+ 'tagname',
+ );
+
+ $p->handler(
+ end => sub {
+ $accumulate = 1;    # any end tag switches accumulation back on
+ return;
+ }
+ );
+
+ $p->handler(
+ text => sub {
+ my ($copy) = @_;
+
+ return if !$accumulate;
+
+ # pyzor’s HTML parser discards HTML entities. On top of that,
+ # we need to match, as closely as possible, pyzor’s handling of
+ # invalid HTML entities — which is a function of Python’s
+ # standard HTML parsing library. This will probably never be
+ # fully compatible with pyzor, but we can get it close.
+
+ # The original is:
+ #
+ # re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
+ #
+ # The parsing loop then “backs up” one byte if the last
+ # character isn’t a “;”. We use a look-ahead assertion to
+ # mimic that behavior.
+ $copy =~ s<\&\# (?:[0-9]+ | [xX][0-9a-fA-F]+) (?: ; | \z | (?=[^0-9a-fA-F])
)>< >gx;
+
+ # The original is:
+ #
+ # re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
+ #
+ # We again use a look-ahead assertion to mimic Python.
+ $copy =~ s<\& [a-zA-Z] [-.a-zA-Z0-9]* (?: ; | \z | (?=[^a-zA-Z0-9])
)>< >gx;
+
+ # Python’s HTMLParser aborts its parsing loop when it encounters
+ # an invalid numeric reference.
+ $copy =~ s<\&\#
+ (?:
+ [^0-9xX] # anything but the expected first char
+ |
+ [0-9]+[a-fA-F] # hex within decimal
+ |
+ [xX][^0-9a-fA-F]
+ )
+ (.*)
+ ><
+ ( -1 == index($1, ';') ) ? q<> : '&#'
+ >exs;
+
+ # Python’s HTMLParser treats invalid entities as incomplete
+ $copy =~ s<(\&\#?)><$1 >gx;
+
+ $copy =~ s<\A\s+><>;
+ $copy =~ s<\s+\z><>;
+
+ push @pieces, \$copy if length $copy;    # chunks left empty after trimming are dropped
+ },
+ 'text,tagname',
+ );
+
+ $p->parse($html);
+ $p->eof();
+
+ my $payload = join( q< >, map { $$_ } @pieces );
+
+ # Convert all sequences of whitespace OTHER THAN non-breaking spaces to
+ # plain spaces.
+ $payload =~ s<[^\S\x{a0}]+>< >g;
+
+ return $payload;
+}
+
+1;
diff --git a/t/pyzor.t b/t/pyzor.t
index 891f38d..e4ef83f 100755
--- a/t/pyzor.t
+++ b/t/pyzor.t
@@ -3,12 +3,9 @@
use lib '.'; use lib 't';
use SATest; sa_t_init("pyzor");
-use constant HAS_PYZOR => eval { $_ = untaint_cmd("which pyzor"); chomp; -x };
-
use Test::More;
plan skip_all => "Net tests disabled" unless conf_bool('run_net_tests');
-plan skip_all => "Pyzor executable not found in path" unless HAS_PYZOR;
-plan tests => 8;
+plan tests => 5;
diag('Note: Failures may not be an SpamAssassin bug, as Pyzor tests can fail due to problems with the Pyzor servers.');
@@ -30,7 +27,7 @@ tstprefs ("
sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
ok_all_patterns();
# Same with fork
-sarun ("--cf='pyzor_fork 1' -t < data/spam/pyzor", \&patterns_run_cb);
+sarun ("-t < data/spam/pyzor", \&patterns_run_cb);
ok_all_patterns();
#TESTING FOR HAM
@@ -44,7 +41,3 @@ ok_all_patterns();
sarun ("-D pyzor -t < data/nice/001 2>&1", \&patterns_run_cb);
ok_all_patterns();
-# same with fork
-sarun ("-D pyzor --cf='pyzor_fork 1' -t < data/nice/001 2>&1",
\&patterns_run_cb);
-ok_all_patterns();
-