Fix Sphinx XML UTF-8 related issue: filter out Unicode control characters, and only back up the delta file if the operation failed

src
Kevin Lynx 11 years ago
parent 1d870e2e42
commit 79291ab4e9

@ -3,8 +3,33 @@
%% Kevin Lynx
%%
-module(string_util).
-export([format/2]).
-compile(export_all).
-export([format/2, strip_invalid_unicode/1]).
%% Render Arg through the io_lib:format/2 control string Fmt and return the
%% result as a single flat string instead of a nested character list.
%% Both Fmt and Arg must be lists.
format(Fmt, Arg) when is_list(Fmt), is_list(Arg) ->
    Chars = io_lib:format(Fmt, Arg),
    lists:flatten(Chars).
%% Strip the code points rejected by is_valid_unicode/1 from UTF-8 text.
%%
%% Accepts either a binary or a list of bytes (converted with
%% list_to_binary/1) and returns the same kind of value it was given.
%% A byte that does not start a well-formed UTF-8 sequence is dropped on its
%% own, after which scanning resumes at the next byte.
strip_invalid_unicode(L) when is_list(L) ->
    binary_to_list(strip_invalid_unicode(list_to_binary(L)));
strip_invalid_unicode(Bin) when is_binary(Bin) ->
    strip_invalid_unicode(Bin, <<>>).

%% Tail-recursive worker. Kept characters are appended to Acc, which avoids
%% re-copying the already processed tail for every character and keeps the
%% stack depth constant for long inputs.
strip_invalid_unicode(<<>>, Acc) ->
    Acc;
strip_invalid_unicode(<<C/utf8, R/binary>>, Acc) ->
    case is_valid_unicode(C) of
        true ->
            strip_invalid_unicode(R, <<Acc/binary, C/utf8>>);
        false ->
            strip_invalid_unicode(R, Acc)
    end;
strip_invalid_unicode(<<_, R/binary>>, Acc) ->
    %% Not decodable as UTF-8 at this position: skip one byte and retry.
    strip_invalid_unicode(R, Acc).
%% Tell whether the code point C is kept by strip_invalid_unicode/1.
%%
%% Rejected: the C0 control range (below 16#20), DEL (16#7f) and the C1
%% control range (16#80..16#9f). Everything else is accepted, including the
%% printable Latin-1 characters 16#a0..16#ff (e.g. accented letters), which
%% are not control characters and must not be removed from names.
is_valid_unicode(C) when C < 16#20 ->
    false;
is_valid_unicode(C) when C >= 16#7f, C =< 16#9f ->
    false;
is_valid_unicode(_) ->
    true.

@ -18,10 +18,9 @@ search(Conn, Key, Offset, Count) ->
{T2, TDocs} = case catch giza_request:send(Q4) of
{'EXIT', R} ->
?W(?FMT("sphinx search error ~p", [R])),
[];
{now(), []};
{ok, Ret} ->
T = now(),
{T, decode_search_ret(Conn, Ret)}
{now(), decode_search_ret(Conn, Ret)}
end,
T3 = now(),
Stats = {timer:now_diff(T2, T1), timer:now_diff(T3, T2)},

@ -27,7 +27,8 @@ do_build_init_index(MainFile, DeltaFile, CfgFile) ->
build_delta_index(IndexFile, Delta, CfgFile, MinID, MaxID) ->
Cmd = "indexer -c " ++ CfgFile ++ " --rotate " ++ Delta,
Res = os:cmd(Cmd),
Dest = backup_delta_file(Delta, MinID, MaxID, IndexFile),
Success = check_cmd_success(Res),
Dest = backup_delta_file(Delta, MinID, MaxID, IndexFile, Success),
?I(?FMT("command `~s' result on ~s~n" ++ Res, [Cmd, Dest])).
merge_index(Main, Delta, CfgFile) ->
@ -36,9 +37,13 @@ merge_index(Main, Delta, CfgFile) ->
Res = os:cmd(Cmd),
?I(?FMT("command `~s' result~n" ++ Res, [Cmd])).
backup_delta_file(Delta, MinID, MaxID, IndexFile) ->
backup_delta_file(Delta, MinID, MaxID, IndexFile, Flag) ->
Path = filename:dirname(IndexFile),
Dest = string_util:format(Path ++ "/" ++ Delta ++ "[~b-~b]" ++ ".xml",
[MinID, MaxID]),
file:copy(IndexFile, Dest),
if not Flag -> file:copy(IndexFile, Dest); true -> skip end,
Dest.
%% Report whether the command output Res looks like a successful run.
%% Too simple: it only searches the output for a "successfully" message.
%% Both the misspelt marker "succesfully" (the only one matched originally)
%% and the correct spelling are accepted, so a tool release that fixes the
%% typo in its output is not misreported as a failure.
%% NOTE(review): confirm the exact wording printed by the deployed indexer.
check_cmd_success(Res) ->
    Markers = ["succesfully", "successfully"],
    lists:any(fun(Marker) -> string:str(Res, Marker) > 0 end, Markers).

@ -5,6 +5,7 @@
%%
-module(sphinx_xml).
-behaviour(gen_server).
-compile(export_all).
-include("vlog.hrl").
-export([init/1,
handle_call/3,
@ -48,9 +49,9 @@ handle_cast(save, #state{docs = Docs, ids = IDs} = State) when length(Docs) > 0
handle_cast(stop, State) ->
{stop, normal, State}.
handle_call({insert, {ID, Hash, Name, Files, Query, CreatedAt}}, _From, State) ->
handle_call({insert, DocT}, _From, State) ->
#state{docs = Docs, ids = IDs, max = Max} = State,
Doc = sphinx_doc:element(Hash, Name, Files, ID, Query, CreatedAt),
{ID, Doc} = create_doc(DocT),
{NewDocs, NewIDs} = try_save([Doc|Docs], Max, [ID|IDs]),
{reply, ok, State#state{docs = NewDocs, ids = NewIDs}};
@ -83,3 +84,20 @@ get_id_range([First|IDs]) ->
lists:foldl(fun(ID, {Min, Max}) ->
{min(ID, Min), max(ID, Max)}
end, {First, First}, IDs).
%% Build the document element for one insert request.
%% The torrent name and every file name are passed through valid_name/1
%% before being handed to sphinx_doc:element/6. Returns {ID, Doc}.
create_doc({ID, Hash, Name, Files, Query, CreatedAt}) ->
    {ID,
     sphinx_doc:element(Hash,
                        valid_name(Name),
                        valid_file_names(Files),
                        ID,
                        Query,
                        CreatedAt)}.
%% Apply valid_name/1 to the name of every {Name, Length} entry in the list,
%% keeping the length untouched and the order unchanged. Elements that are
%% not 2-tuples are left out of the result.
valid_file_names([{Name, Length} | Rest]) ->
    [{valid_name(Name), Length} | valid_file_names(Rest)];
valid_file_names([_Other | Rest]) ->
    valid_file_names(Rest);
valid_file_names([]) ->
    [].
%% Return S with the characters rejected by
%% string_util:strip_invalid_unicode/1 removed. When the result is shorter
%% than the input, the before/after pair is logged.
valid_name(S) ->
    Stripped = string_util:strip_invalid_unicode(S),
    log_if_shortened(S, Stripped),
    Stripped.

%% Log only when something was removed. The length comparison stays in a
%% guard so that a non-list argument simply selects the second clause
%% instead of raising.
log_if_shortened(Original, Stripped) when length(Stripped) < length(Original) ->
    ?I(?FMT("~s -> ~s", [Original, Stripped]));
log_if_shortened(_Original, _Stripped) ->
    ok.

Loading…
Cancel
Save