File xapian-fix-snippet-generator-CJK.patch of Package xapian-core
Index: xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.cc
===================================================================
--- xapian-core-1fmwheezy28569.orig/queryparser/snippetgenerator_internal.cc 2013-05-24 07:12:27.000000000 +0000
+++ xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.cc 2013-05-24 07:12:38.830056601 +0000
@@ -44,8 +44,16 @@ SnippetGenerator::Internal::add_match(co
matches.insert(stem);
}
+// 'ngram_len' is the length in *characters* (not octets) of an N-gram,
+// or 0 if 'term' is a complete term rather than an N-gram. We use this
+// to detect when we are being passed N-grams. We also rely on the
+// behaviour of the CJKTokenizer class, which returns N-grams in
+// increasing length order, so that when ngram_len==1 we know the N-gram
+// base position has just been incremented.
+
void
-SnippetGenerator::Internal::accept_term(const string & term, termcount pos)
+SnippetGenerator::Internal::accept_term(const string & term,
+ termcount pos, int ngram_len)
{
string stem = Unicode::tolower(term);
if (stemmer.internal.get())
@@ -55,13 +63,17 @@ SnippetGenerator::Internal::accept_term(
if (pos > (lastpos+2)) {
context.clear();
leading_nonword = "";
+ pending_1gram = "";
+ ignore_1grams = 0;
}
+ if (ngram_len <= 1)
+ xpos += (pos - lastpos);
lastpos = pos;
nwhitespace = 0;
if (matches.find(stem) != matches.end()) {
// found a match
- if (pos > horizon + context.size() + 1 && result != "") {
+ if (xpos > horizon + context.size() + 1 && result != "") {
// there was a gap from the end of the context after
// the previous snippet, so start a new snippet
result += inter_snippet;
@@ -70,6 +82,11 @@ SnippetGenerator::Internal::accept_term(
}
leading_nonword = "";
+ if (ngram_len == 1 && pending_1gram != "") {
+ push_context(pending_1gram);
+ pending_1gram = "";
+ }
+
// flush the before-context
while (!context.empty()) {
result += context.front();
@@ -81,15 +98,39 @@ SnippetGenerator::Internal::accept_term(
result += term;
result += post_match;
+ // some following 1-grams may be included in the
+ // match text, so don't add them to context.
+ ignore_1grams = (ngram_len > 1 ? ngram_len-1 : 0);
+
// set the horizon to mark the end of the after-context
- horizon = pos + context_length;
- } else if (pos <= horizon) {
+ horizon = xpos + context_length + ignore_1grams;
+ } else if (xpos <= horizon) {
// the after-context for a match
- result += term;
+ if (ngram_len == 0) {
+ result += term;
+ } else if (ngram_len == 1) {
+ if (ignore_1grams)
+ ignore_1grams--;
+ else
+ result += term;
+ }
+ // don't keep N>1 N-grams in context, they're redundant
} else {
// not explicitly in a context, but remember the
// term in the context queue for later
- push_context(term);
+ if (ngram_len == 0) {
+ push_context(term);
+ } else if (ngram_len == 1) {
+ if (pending_1gram != "") {
+ push_context(pending_1gram);
+ pending_1gram = "";
+ }
+ if (ignore_1grams)
+ ignore_1grams--;
+ else
+ pending_1gram = term;
+ }
+ // don't keep N>1 N-grams in context, they're redundant
}
}
@@ -111,6 +152,7 @@ SnippetGenerator::Internal::accept_nonwo
Unicode::append_utf8(leading_nonword, ch);
return;
}
+ xpos += (pos - lastpos);
if (Unicode::is_whitespace(ch)) {
if (++nwhitespace > 1)
@@ -120,13 +162,19 @@ SnippetGenerator::Internal::accept_nonwo
nwhitespace = 0;
}
+ if (pending_1gram != "") {
+ push_context(pending_1gram);
+ pending_1gram = "";
+ }
+ ignore_1grams = 0;
+
if (!pos) {
// non-word characters before the first word
Unicode::append_utf8(leading_nonword, ch);
}
- else if (pos <= horizon) {
+ else if (xpos <= horizon) {
// the last word of the after-context of a snippet
- if (ch == ' ' && pos == horizon) {
+ if (ch == ' ' && xpos == horizon) {
// after-context ends on first whitespace
// after the last word in the horizon,
// ...unless another one abuts it, but we
@@ -261,7 +309,7 @@ SnippetGenerator::Internal::accept_text(
if (cjk_token.size() > MAX_PROB_TERM_LENGTH) continue;
// Add unstemmed form positional information.
- accept_term(cjk_token, ++termpos);
+ accept_term(cjk_token, ++termpos, tk.get_length());
}
while (true) {
if (itor == Utf8Iterator()) return;
@@ -318,7 +366,7 @@ SnippetGenerator::Internal::accept_text(
endofterm:
if (term.size() > MAX_PROB_TERM_LENGTH) continue;
- accept_term(term, ++termpos);
+ accept_term(term, ++termpos, 0);
}
}
@@ -329,11 +377,14 @@ SnippetGenerator::Internal::reset()
result = "";
horizon = 0;
lastpos = 0;
+ xpos = 0;
nwhitespace = 0;
context.clear();
matches.clear();
termpos = 0;
leading_nonword = "";
+ pending_1gram = "";
+ ignore_1grams = 0;
}
}
Index: xapian-core-1fmwheezy28569/tests/termgentest.cc
===================================================================
--- xapian-core-1fmwheezy28569.orig/tests/termgentest.cc 2013-05-24 07:01:21.000000000 +0000
+++ xapian-core-1fmwheezy28569/tests/termgentest.cc 2013-05-24 07:14:13.617074471 +0000
@@ -951,6 +951,51 @@ static bool test_sg_first_nonword()
return true;
}
+static bool test_sg_cjk_punctuation()
+{
+ Xapian::SnippetGenerator snipgen;
+ snipgen.set_context_length(3);
+ Xapian::Stem stemmer("en");
+ snipgen.set_stemmer(stemmer);
+
+ std::string text("申込み殺到!Nexus7が0円!WiMAX月額3,770円使い放題とセットでお得");
+
+ // The text must contain U+FF01 FULLWIDTH EXCLAMATION MARK (not ASCII
+ // '!'): it is both a CJK character and a non-word character. It should
+ // be handled as non-word text and appear verbatim in both the before
+ // and after contexts of the match.
+ snipgen.reset();
+ snipgen.add_match("Nexus7");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "み殺到!<b>Nexus7</b>が0円!");
+
+ return true;
+}
+
+static bool test_sg_cjk_ngrams()
+{
+ Xapian::SnippetGenerator snipgen;
+ snipgen.set_context_length(3);
+ Xapian::Stem stemmer("en");
+ snipgen.set_stemmer(stemmer);
+
+ std::string text("申込み殺到!Nexus7が0円!WiMAX月額3,770円使い放題とセットでお得");
+
+ // a one-character CJK search term should be matched, with three characters of context on each side
+ snipgen.reset();
+ snipgen.add_match("放");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "円使い<b>放</b>題とセ");
+
+ // a two-character CJK search term should be matched as a unit; its second character is not repeated in the after-context
+ snipgen.reset();
+ snipgen.add_match("放題");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "円使い<b>放題</b>とセッ");
+
+ return true;
+}
+
static bool test_sg_stem()
{
Xapian::SnippetGenerator snipgen;
@@ -992,6 +1037,8 @@ static const test_desc tests[] = {
TESTCASE(snipgen1),
TESTCASE(sg_first_nonword),
TESTCASE(sg_stem),
+ TESTCASE(sg_cjk_punctuation),
+ TESTCASE(sg_cjk_ngrams),
END_OF_TESTCASES
};
Index: xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.h
===================================================================
--- xapian-core-1fmwheezy28569.orig/queryparser/snippetgenerator_internal.h 2013-05-24 07:12:27.000000000 +0000
+++ xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.h 2013-05-24 07:12:38.830056601 +0000
@@ -40,16 +40,19 @@ class SnippetGenerator::Internal : publi
std::tr1::unordered_set<std::string> matches;
unsigned nwhitespace;
std::string leading_nonword;
+ std::string pending_1gram;
+ unsigned ignore_1grams;
termcount horizon;
termcount lastpos;
+ termcount xpos; // doesn't count N>1 N-grams
std::deque<std::string> context;
std::string result;
termcount termpos;
void push_context(const std::string & term);
- void accept_term(const std::string & term, termcount pos);
+ void accept_term(const std::string & term, termcount pos, int ngram_len);
void accept_nonword_char(unsigned ch, termcount pos);
public: