1
0
Fork 0
mirror of https://github.com/processone/ejabberd synced 2025-10-06 03:50:15 +02:00

Move spam files parsing to a submodule

This commit is contained in:
Badlop 2025-06-21 23:33:35 +02:00
parent 88ae3fddf3
commit a77c7e36b0
2 changed files with 233 additions and 142 deletions

View file

@ -234,41 +234,33 @@ mod_doc() ->
-spec init(list()) -> {ok, state()} | {stop, term()}.
init([Host, Opts]) ->
process_flag(trap_exit, true),
Files =
#{domains => gen_mod:get_opt(spam_domains_file, Opts),
jid => gen_mod:get_opt(spam_jids_file, Opts),
url => gen_mod:get_opt(spam_urls_file, Opts),
whitelist_domains => gen_mod:get_opt(whitelist_domains_file, Opts)},
try read_files(Files) of
#{jid := JIDsSet,
url := URLsSet,
domains := SpamDomainsSet,
whitelist_domains := WhitelistDomains} ->
ejabberd_hooks:add(local_send_to_resource_hook,
Host,
mod_antispam_rtbl,
pubsub_event_handler,
50),
RTBLHost = gen_mod:get_opt(rtbl_host, Opts),
RTBLDomainsNode = gen_mod:get_opt(rtbl_domains_node, Opts),
mod_antispam_filter:init_filtering(Host),
InitState =
#state{host = Host,
jid_set = JIDsSet,
url_set = URLsSet,
dump_fd = mod_antispam_dump:init_dumping(Host),
max_cache_size = gen_mod:get_opt(cache_size, Opts),
blocked_domains = set_to_map(SpamDomainsSet),
whitelist_domains = set_to_map(WhitelistDomains, false),
rtbl_host = RTBLHost,
rtbl_domains_node = RTBLDomainsNode},
mod_antispam_rtbl:request_blocked_domains(RTBLHost, RTBLDomainsNode, Host),
{ok, InitState}
catch
{Op, File, Reason} when Op == open; Op == read ->
?CRITICAL_MSG("Cannot ~s ~s: ~s", [Op, File, format_error(Reason)]),
{stop, config_error}
end.
mod_antispam_files:init_files(Host),
FilesResults = read_files(Host),
#{jid := JIDsSet,
url := URLsSet,
domains := SpamDomainsSet,
whitelist_domains := WhitelistDomains} =
FilesResults,
ejabberd_hooks:add(local_send_to_resource_hook,
Host,
mod_antispam_rtbl,
pubsub_event_handler,
50),
RTBLHost = gen_mod:get_opt(rtbl_host, Opts),
RTBLDomainsNode = gen_mod:get_opt(rtbl_domains_node, Opts),
mod_antispam_filter:init_filtering(Host),
InitState =
#state{host = Host,
jid_set = JIDsSet,
url_set = URLsSet,
dump_fd = mod_antispam_dump:init_dumping(Host),
max_cache_size = gen_mod:get_opt(cache_size, Opts),
blocked_domains = set_to_map(SpamDomainsSet),
whitelist_domains = set_to_map(WhitelistDomains, false),
rtbl_host = RTBLHost,
rtbl_domains_node = RTBLDomainsNode},
mod_antispam_rtbl:request_blocked_domains(RTBLHost, RTBLDomainsNode, Host),
{ok, InitState}.
-spec handle_call(term(), {pid(), term()}, state()) ->
{reply, {spam_filter, term()}, state()} | {noreply, state()}.
@ -287,8 +279,8 @@ handle_call({check_body, URLs, JIDs, From},
Result2
end,
{reply, {spam_filter, Result}, State2};
handle_call({reload_files, Files}, _From, State) ->
{Result, State1} = reload_files(Files, State),
handle_call(reload_spam_files, _From, State) ->
{Result, State1} = reload_files(State),
{reply, {spam_filter, Result}, State1};
handle_call({expire_cache, Age}, _From, State) ->
{Result, State1} = expire_cache(Age, State),
@ -355,12 +347,7 @@ handle_cast({reload, NewOpts, OldOpts},
State1
end,
ok = mod_antispam_rtbl:unsubscribe(OldRTBLHost, OldRTBLDomainsNode, Host),
Files =
#{domains => gen_mod:get_opt(spam_domains_file, NewOpts),
jid => gen_mod:get_opt(spam_jids_file, NewOpts),
url => gen_mod:get_opt(spam_urls_file, NewOpts),
whitelist_domains => gen_mod:get_opt(whitelist_domains_file, NewOpts)},
{_Result, State3} = reload_files(Files, State2#state{blocked_domains = #{}}),
{_Result, State3} = reload_files(State2#state{blocked_domains = #{}}),
RTBLHost = gen_mod:get_opt(rtbl_host, NewOpts),
RTBLDomainsNode = gen_mod:get_opt(rtbl_domains_node, NewOpts),
ok = mod_antispam_rtbl:request_blocked_domains(RTBLHost, RTBLDomainsNode, Host),
@ -440,6 +427,7 @@ terminate(Reason,
?DEBUG("Stopping spam filter process for ~s: ~p", [Host, Reason]),
misc:cancel_timer(RTBLRetryTimer),
mod_antispam_dump:terminate_dumping(Host, Fd),
mod_antispam_files:terminate_files(Host),
mod_antispam_filter:terminate_filtering(Host),
ejabberd_hooks:delete(local_send_to_resource_hook,
Host,
@ -494,10 +482,9 @@ filter_body({_, Addrs}, Set, From, #state{host = Host} = State) ->
filter_body(none, _Set, _From, State) ->
{ham, State}.
-spec reload_files(#{Type :: atom() => filename()}, state()) ->
{ok | {error, binary()}, state()}.
reload_files(Files, #state{host = Host, blocked_domains = BlockedDomains} = State) ->
try read_files(Files) of
-spec reload_files(state()) -> {ok | {error, binary()}, state()}.
reload_files(#state{host = Host, blocked_domains = BlockedDomains} = State) ->
case read_files(Host) of
#{jid := JIDsSet,
url := URLsSet,
domains := SpamDomainsSet,
@ -518,12 +505,9 @@ reload_files(Files, #state{host = Host, blocked_domains = BlockedDomains} = Stat
State#state{jid_set = JIDsSet,
url_set = URLsSet,
blocked_domains = maps:merge(BlockedDomains, set_to_map(SpamDomainsSet)),
whitelist_domains = set_to_map(WhitelistDomains, false)}}
catch
{Op, File, Reason} when Op == open; Op == read ->
Txt = format("Cannot ~s ~s for ~s: ~s", [Op, File, Host, format_error(Reason)]),
?ERROR_MSG("~s", [Txt]),
{{error, Txt}, State}
whitelist_domains = set_to_map(WhitelistDomains, false)}};
{config_error, ErrorText} ->
{{error, ErrorText}, State}
end.
set_to_map(Set) ->
@ -532,80 +516,18 @@ set_to_map(Set) ->
set_to_map(Set, V) ->
sets:fold(fun(K, M) -> M#{K => V} end, #{}, Set).
-spec read_files(#{Type => filename()}) ->
#{jid => jid_set(),
url => url_set(),
Type => sets:set(binary())}
when Type :: atom().
read_files(Files) ->
maps:map(fun(Type, Filename) -> read_file(Filename, line_parser(Type)) end, Files).
-spec line_parser(Type :: atom()) -> fun((binary()) -> binary()).
line_parser(jid) ->
fun parse_jid/1;
line_parser(url) ->
fun parse_url/1;
line_parser(_) ->
fun trim/1.
-spec read_file(filename(), fun((binary()) -> ljid() | url())) -> jid_set() | url_set().
read_file(none, _ParseLine) ->
sets:new();
read_file(File, ParseLine) ->
case file:open(File, [read, binary, raw, {read_ahead, 65536}]) of
{ok, Fd} ->
try
read_line(Fd, ParseLine, sets:new())
catch
E ->
throw({read, File, E})
after
ok = file:close(Fd)
end;
{error, Reason} ->
throw({open, File, Reason})
end.
-spec read_line(file:io_device(),
fun((binary()) -> ljid() | url()),
jid_set() | url_set()) ->
jid_set() | url_set().
read_line(Fd, ParseLine, Set) ->
case file:read_line(Fd) of
{ok, Line} ->
read_line(Fd, ParseLine, sets:add_element(ParseLine(Line), Set));
{error, Reason} ->
throw(Reason);
eof ->
Set
end.
-spec parse_jid(binary()) -> ljid().
parse_jid(S) ->
try jid:decode(trim(S)) of
#jid{} = JID ->
jid:remove_resource(
jid:tolower(JID))
catch
_:{bad_jid, _} ->
throw({bad_jid, S})
end.
-spec parse_url(binary()) -> url().
parse_url(S) ->
URL = trim(S),
RE = <<"https?://\\S+$">>,
Options = [anchored, caseless, {capture, none}],
case re:run(URL, RE, Options) of
match ->
URL;
nomatch ->
throw({bad_url, S})
end.
-spec trim(binary()) -> binary().
trim(S) ->
re:replace(S, <<"\\s+$">>, <<>>, [{return, binary}]).
read_files(Host) ->
AccInitial =
#{jid => sets:new(),
url => sets:new(),
domains => sets:new(),
whitelist_domains => sets:new()},
Files =
#{jid => gen_mod:get_module_opt(Host, ?MODULE, spam_jids_file),
url => gen_mod:get_module_opt(Host, ?MODULE, spam_urls_file),
domains => gen_mod:get_module_opt(Host, ?MODULE, spam_domains_file),
whitelist_domains => gen_mod:get_module_opt(Host, ?MODULE, whitelist_domains_file)},
ejabberd_hooks:run_fold(antispam_get_lists, Host, AccInitial, [Files]).
-spec get_proc_name(binary()) -> atom().
get_proc_name(Host) ->
@ -623,14 +545,6 @@ sets_equal(A, B) ->
format(Format, Data) ->
iolist_to_binary(io_lib:format(Format, Data)).
-spec format_error(atom() | tuple()) -> binary().
format_error({bad_jid, JID}) ->
<<"Not a valid JID: ", JID/binary>>;
format_error({bad_url, URL}) ->
<<"Not an HTTP(S) URL: ", URL/binary>>;
format_error(Reason) ->
list_to_binary(file:format_error(Reason)).
%%--------------------------------------------------------------------
%%| Caching
@ -792,16 +706,11 @@ try_call_by_host(Host, Call) ->
reload_spam_filter_files(<<"global">>) ->
for_all_hosts(fun reload_spam_filter_files/1, []);
reload_spam_filter_files(Host) ->
LServer = jid:nameprep(Host),
Files =
#{domains => gen_mod:get_module_opt(LServer, ?MODULE, spam_domains_file),
jid => gen_mod:get_module_opt(LServer, ?MODULE, spam_jids_file),
url => gen_mod:get_module_opt(LServer, ?MODULE, spam_urls_file)},
case try_call_by_host(Host, {reload_files, Files}) of
case try_call_by_host(Host, reload_spam_files) of
{spam_filter, ok} ->
ok;
{spam_filter, {error, Txt}} ->
{error, binary_to_list(Txt)};
{error, Txt};
{error, _R} = Error ->
Error
end.

182
src/mod_antispam_files.erl Normal file
View file

@ -0,0 +1,182 @@
%%%----------------------------------------------------------------------
%%% File : mod_antispam_files.erl
%%% Author : Holger Weiss <holger@zedat.fu-berlin.de>
%%% Author : Stefan Strigler <stefan@strigler.de>
%%% Purpose : Filter spam messages based on sender JID and content
%%% Created : 31 Mar 2019 by Holger Weiss <holger@zedat.fu-berlin.de>
%%%
%%%
%%% ejabberd, Copyright (C) 2019-2025 ProcessOne
%%%
%%% This program is free software; you can redistribute it and/or
%%% modify it under the terms of the GNU General Public License as
%%% published by the Free Software Foundation; either version 2 of the
%%% License, or (at your option) any later version.
%%%
%%% This program is distributed in the hope that it will be useful,
%%% but WITHOUT ANY WARRANTY; without even the implied warranty of
%%% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
%%% General Public License for more details.
%%%
%%% You should have received a copy of the GNU General Public License along
%%% with this program; if not, write to the Free Software Foundation, Inc.,
%%% 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
%%%
%%%----------------------------------------------------------------------
%%| definitions
%% @format-begin
-module(mod_antispam_files).
-author('holger@zedat.fu-berlin.de').
-author('stefan@strigler.de').
%% Exported
-export([init_files/1, terminate_files/1]).
% Hooks
-export([get_files_lists/2]).
-include("ejabberd_commands.hrl").
-include("logger.hrl").
-include("mod_antispam.hrl").
-include("translate.hrl").
-include_lib("xmpp/include/xmpp.hrl").
-type files_map() :: #{atom() => filename()}.
-type lists_map() ::
#{jid => jid_set(),
url => url_set(),
atom() => sets:set(binary())}.
-define(COMMAND_TIMEOUT, timer:seconds(30)).
-define(DEFAULT_CACHE_SIZE, 10000).
-define(DEFAULT_RTBL_DOMAINS_NODE, <<"spam_source_domains">>).
-define(HTTPC_TIMEOUT, timer:seconds(3)).
%%--------------------------------------------------------------------
%%| Exported
%% Register this module's list provider on the `antispam_get_lists' hook
%% of Host, so that get_files_lists/2 is run whenever that hook is folded.
init_files(Host) ->
    ejabberd_hooks:add(antispam_get_lists,
                       Host,
                       ?MODULE,
                       get_files_lists,
                       50).
%% Remove the `antispam_get_lists' hook handler for Host, using the same
%% module, function and sequence number that init_files/1 registered.
terminate_files(Host) ->
    ejabberd_hooks:delete(antispam_get_lists,
                          Host,
                          ?MODULE,
                          get_files_lists,
                          50).
%%--------------------------------------------------------------------
%%| Hooks
%% Handler for the `antispam_get_lists' hook.
%%
%% Acc   - the lists accumulated so far; must contain the keys `jid', `url',
%%         `domains' and `whitelist_domains', each holding a set.
%% Files - map from list type to the file that provides entries for it
%%         (or `none').
%%
%% On success the sets parsed from Files are merged (set union) into the
%% matching entries of Acc and the updated accumulator is returned.
%%
%% If a file cannot be opened or read/parsed, the problem is logged and
%% `{stop, {config_error, ErrorText}}' is returned, where ErrorText is a
%% binary describing the failure.
%% NOTE(review): this relies on the hook runner treating `{stop, Value}' as
%% "abort the fold and return Value" - confirm against ejabberd_hooks.
-spec get_files_lists(lists_map(), files_map()) ->
                         lists_map() | {stop, {config_error, binary()}}.
get_files_lists(#{jid := AccJids,
                  url := AccUrls,
                  domains := AccDomains,
                  whitelist_domains := AccWhitelist} =
                    Acc,
                Files) ->
    %% Only read_files/1 is protected by the try; the merge below runs in
    %% the unprotected `of' section.
    try read_files(Files) of
        #{jid := JIDsSet,
          url := URLsSet,
          domains := SpamDomainsSet,
          whitelist_domains := WhitelistDomains} ->
            Acc#{jid => sets:union(AccJids, JIDsSet),
                 url => sets:union(AccUrls, URLsSet),
                 domains => sets:union(AccDomains, SpamDomainsSet),
                 whitelist_domains => sets:union(AccWhitelist, WhitelistDomains)}
    catch
        {Op, File, Reason} when Op == open; Op == read ->
            ErrorText =
                format("Error trying to ~s file ~s: ~s", [Op, File, format_error(Reason)]),
            %% ErrorText embeds the file name and possibly the offending
            %% line, so it must be passed as data: used as the format string
            %% itself, any `~' it contains would be taken as a control
            %% sequence.
            ?CRITICAL_MSG("~s", [ErrorText]),
            {stop, {config_error, ErrorText}}
    end.
%%--------------------------------------------------------------------
%%| read_files
%% Turn a map of `Type => Filename' into a map of `Type => Set', where each
%% set holds the entries of the corresponding file parsed with the line
%% parser selected for that type. Failures surface as throws from
%% read_file/2.
-spec read_files(files_map()) -> lists_map().
read_files(Files) ->
    LoadOne =
        fun(Type, Filename) ->
           Parser = line_parser(Type),
           read_file(Filename, Parser)
        end,
    maps:map(LoadOne, Files).
%% Select the function used to parse one line of a list file.
%%
%% `jid' lines are parsed with parse_jid/1, `url' lines with parse_url/1;
%% every other list type only has trailing whitespace removed (trim/1).
%% The result type of the returned fun therefore depends on Type, which is
%% why the spec lists all three possibilities (matching what read_file/2
%% accepts) rather than just binary().
-spec line_parser(Type :: atom()) -> fun((binary()) -> ljid() | url() | binary()).
line_parser(jid) ->
    fun parse_jid/1;
line_parser(url) ->
    fun parse_url/1;
line_parser(_) ->
    fun trim/1.
%% Read every line of File into a set, passing each line through ParseLine.
%%
%% A file name of `none' yields an empty set without touching the disk.
%% Throws `{open, File, Reason}' when the file cannot be opened, and
%% `{read, File, Thrown}' when reading or parsing a line throws. The file
%% descriptor is closed in all cases once it has been opened.
-spec read_file(filename(), fun((binary()) -> ljid() | url())) -> jid_set() | url_set().
read_file(none, _ParseLine) ->
    sets:new();
read_file(File, ParseLine) ->
    OpenModes = [read, binary, raw, {read_ahead, 65536}],
    case file:open(File, OpenModes) of
        {error, Reason} ->
            throw({open, File, Reason});
        {ok, Fd} ->
            try
                read_line(Fd, ParseLine, sets:new())
            catch
                %% Only throws are tagged with the file name; errors and
                %% exits propagate unchanged.
                throw:Thrown ->
                    throw({read, File, Thrown})
            after
                ok = file:close(Fd)
            end
    end.
%% Consume Fd line by line until end of file, adding ParseLine(Line) to Set
%% for every line read. Returns the resulting set; a read error is thrown
%% as the bare reason returned by file:read_line/1.
-spec read_line(file:io_device(),
                fun((binary()) -> ljid() | url()),
                jid_set() | url_set()) ->
                   jid_set() | url_set().
read_line(Fd, ParseLine, Set) ->
    handle_line(file:read_line(Fd), Fd, ParseLine, Set).

%% Dispatch on the result of one file:read_line/1 call.
handle_line(eof, _Fd, _ParseLine, Set) ->
    Set;
handle_line({ok, Line}, Fd, ParseLine, Set) ->
    Entry = ParseLine(Line),
    read_line(Fd, ParseLine, sets:add_element(Entry, Set));
handle_line({error, Reason}, _Fd, _ParseLine, _Set) ->
    throw(Reason).
%% Parse one line of a JID list file: strip trailing whitespace, decode the
%% JID, lowercase it and drop its resource part.
%% Throws `{bad_jid, S}' (with the original, untrimmed line) when decoding
%% fails with a `{bad_jid, _}' reason.
-spec parse_jid(binary()) -> ljid().
parse_jid(S) ->
    Trimmed = trim(S),
    try jid:decode(Trimmed) of
        #jid{} = JID ->
            Lowered = jid:tolower(JID),
            jid:remove_resource(Lowered)
    catch
        _:{bad_jid, _} ->
            throw({bad_jid, S})
    end.
%% Parse one line of a URL list file: strip trailing whitespace and accept
%% the result only if it is an http:// or https:// URL (case-insensitive)
%% made up entirely of non-whitespace characters.
%% Returns the trimmed URL, or throws `{bad_url, S}' with the original line.
-spec parse_url(binary()) -> url().
parse_url(S) ->
    Candidate = trim(S),
    case re:run(Candidate, <<"https?://\\S+$">>, [anchored, caseless, {capture, none}]) of
        nomatch ->
            throw({bad_url, S});
        match ->
            Candidate
    end.
%% Remove trailing whitespace (including the line terminator) from S.
%% Leading and inner whitespace is left untouched.
-spec trim(binary()) -> binary().
trim(S) ->
    TrailingWhitespace = <<"\\s+$">>,
    re:replace(S, TrailingWhitespace, <<>>, [{return, binary}]).
%% Same helper as the one in mod_antispam.erl: format Data according to
%% Format and return the result as a single flat binary.
-spec format(io:format(), [term()]) -> binary().
format(Format, Data) ->
    Chars = io_lib:format(Format, Data),
    iolist_to_binary(Chars).
%% Produce a human-readable description of a list-file failure.
%% `{bad_jid, Line}' and `{bad_url, Line}' come from the line parsers and
%% carry the offending line; any other reason is handed to
%% file:format_error/1.
-spec format_error(atom() | tuple()) -> binary().
format_error({bad_jid, Line}) ->
    <<"Not a valid JID: ", Line/binary>>;
format_error({bad_url, Line}) ->
    <<"Not an HTTP(S) URL: ", Line/binary>>;
format_error(FileReason) ->
    Description = file:format_error(FileReason),
    list_to_binary(Description).
%%--------------------------------------------------------------------
%%| vim: set foldmethod=marker foldmarker=%%|,%%-: