File xapian-fix-snippet-generator-CJK.patch of Package xapian-core

Index: xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.cc
===================================================================
--- xapian-core-1fmwheezy28569.orig/queryparser/snippetgenerator_internal.cc	2013-05-24 07:12:27.000000000 +0000
+++ xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.cc	2013-05-24 07:12:38.830056601 +0000
@@ -44,8 +44,16 @@ SnippetGenerator::Internal::add_match(co
     matches.insert(stem);
 }
 
+// 'ngram_len' is the length in *characters* (not octets) of an N-gram,
+// or 0 if 'term' is a complete (non-N-gram) term.  We use this to detect
+// when we are being passed N-grams.  We also rely on the behaviour of the
+// CJKTokenizer class, which returns N-grams in increasing length order,
+// so that when ngram_len==1 we know the N-gram base position has just
+// been incremented.
+
 void
-SnippetGenerator::Internal::accept_term(const string & term, termcount pos)
+SnippetGenerator::Internal::accept_term(const string & term,
+					termcount pos, int ngram_len)
 {
     string stem = Unicode::tolower(term);
     if (stemmer.internal.get())
@@ -55,13 +63,17 @@ SnippetGenerator::Internal::accept_term(
     if (pos > (lastpos+2)) {
 	context.clear();
 	leading_nonword = "";
+	pending_1gram = "";
+	ignore_1grams = 0;
     }
+    if (ngram_len <= 1)
+	xpos += (pos - lastpos);
     lastpos = pos;
     nwhitespace = 0;
 
     if (matches.find(stem) != matches.end()) {
 	// found a match
-	if (pos > horizon + context.size() + 1 && result != "") {
+	if (xpos > horizon + context.size() + 1 && result != "") {
 	    // there was a gap from the end of the context after
 	    // the previous snippet, so start a new snippet
 	    result += inter_snippet;
@@ -70,6 +82,11 @@ SnippetGenerator::Internal::accept_term(
 	}
 	leading_nonword = "";
 
+	if (ngram_len == 1 && pending_1gram != "") {
+	    push_context(pending_1gram);
+	    pending_1gram = "";
+	}
+
 	// flush the before-context
 	while (!context.empty()) {
 	    result += context.front();
@@ -81,15 +98,39 @@ SnippetGenerator::Internal::accept_term(
 	result += term;
 	result += post_match;
 
+	// some following 1-grams may be included in the
+	// match text, so don't add them to context.
+	ignore_1grams = (ngram_len > 1 ? ngram_len-1 : 0);
+
 	// set the horizon to mark the end of the after-context
-	horizon = pos + context_length;
-    } else if (pos <= horizon) {
+	horizon = xpos + context_length + ignore_1grams;
+    } else if (xpos <= horizon) {
 	// the after-context for a match
-	result += term;
+	if (ngram_len == 0) {
+	    result += term;
+	} else if (ngram_len == 1) {
+	    if (ignore_1grams)
+		ignore_1grams--;
+	    else
+		result += term;
+	}
+	// don't keep N>1 N-grams in context, they're redundant
     } else {
 	// not explicitly in a context, but remember the
 	// term in the context queue for later
-	push_context(term);
+	if (ngram_len == 0) {
+	    push_context(term);
+	} else if (ngram_len == 1) {
+	    if (pending_1gram != "") {
+		push_context(pending_1gram);
+		pending_1gram = "";
+	    }
+	    if (ignore_1grams)
+		ignore_1grams--;
+	    else
+		pending_1gram = term;
+	}
+	// don't keep N>1 N-grams in context, they're redundant
     }
 }
 
@@ -111,6 +152,7 @@ SnippetGenerator::Internal::accept_nonwo
 	Unicode::append_utf8(leading_nonword, ch);
 	return;
     }
+    xpos += (pos - lastpos);
 
     if (Unicode::is_whitespace(ch)) {
 	if (++nwhitespace > 1)
@@ -120,13 +162,19 @@ SnippetGenerator::Internal::accept_nonwo
 	nwhitespace = 0;
     }
 
+    if (pending_1gram != "") {
+	push_context(pending_1gram);
+	pending_1gram = "";
+    }
+    ignore_1grams = 0;
+
     if (!pos) {
 	// non-word characters before the first word
 	Unicode::append_utf8(leading_nonword, ch);
     }
-    else if (pos <= horizon) {
+    else if (xpos <= horizon) {
 	// the last word of the after-context of a snippet
-	if (ch == ' ' && pos == horizon) {
+	if (ch == ' ' && xpos == horizon) {
 	    // after-context ends on first whitespace
 	    // after the last word in the horizon,
 	    // ...unless another one abuts it, but we
@@ -261,7 +309,7 @@ SnippetGenerator::Internal::accept_text(
 		    if (cjk_token.size() > MAX_PROB_TERM_LENGTH) continue;
 
 		    // Add unstemmed form positional information.
-		    accept_term(cjk_token, ++termpos);
+		    accept_term(cjk_token, ++termpos, tk.get_length());
 		}
 		while (true) {
 		    if (itor == Utf8Iterator()) return;
@@ -318,7 +366,7 @@ SnippetGenerator::Internal::accept_text(
 
 endofterm:
 	if (term.size() > MAX_PROB_TERM_LENGTH) continue;
-	accept_term(term, ++termpos);
+	accept_term(term, ++termpos, 0);
     }
 }
 
@@ -329,11 +377,14 @@ SnippetGenerator::Internal::reset()
     result = "";
     horizon = 0;
     lastpos = 0;
+    xpos = 0;
     nwhitespace = 0;
     context.clear();
     matches.clear();
     termpos = 0;
     leading_nonword = "";
+    pending_1gram = "";
+    ignore_1grams = 0;
 }
 
 }
Index: xapian-core-1fmwheezy28569/tests/termgentest.cc
===================================================================
--- xapian-core-1fmwheezy28569.orig/tests/termgentest.cc	2013-05-24 07:01:21.000000000 +0000
+++ xapian-core-1fmwheezy28569/tests/termgentest.cc	2013-05-24 07:14:13.617074471 +0000
@@ -951,6 +951,51 @@ static bool test_sg_first_nonword()
     return true;
 }
 
+static bool test_sg_cjk_punctuation()
+{
+    Xapian::SnippetGenerator snipgen;
+    snipgen.set_context_length(3);  // expected output below shows 3 terms of context per side
+    Xapian::Stem stemmer("en");
+    snipgen.set_stemmer(stemmer);
+
+    std::string text("申込み殺到!Nexus7が0円!WiMAX月額3,770円使い放題とセットでお得");
+
+    // Purpose: the text contains U+FF01 FULLWIDTH EXCLAMATION MARK, which
+    // is both a CJK character and a non-word character.  It should be
+    // handled as non-word text, and so it should appear at the end of
+    // both the before-context and the after-context of the match.
+    snipgen.reset();
+    snipgen.add_match("Nexus7");
+    snipgen.accept_text(text);
+    TEST_EQUAL(snipgen.get_snippets(), "み殺到!<b>Nexus7</b>が0円!");
+
+    return true;
+}
+
+static bool test_sg_cjk_ngrams()
+{
+    Xapian::SnippetGenerator snipgen;
+    snipgen.set_context_length(3);  // expected outputs below show 3 characters of context per side
+    Xapian::Stem stemmer("en");
+    snipgen.set_stemmer(stemmer);
+
+    std::string text("申込み殺到!Nexus7が0円!WiMAX月額3,770円使い放題とセットでお得");
+
+    // 1-gram match: context must be single characters, with no N>1 N-grams in it
+    snipgen.reset();
+    snipgen.add_match("放");
+    snipgen.accept_text(text);
+    TEST_EQUAL(snipgen.get_snippets(), "円使い<b>放</b>題とセ");
+
+    // 2-gram match: the 1-gram inside the matched text must not repeat after it
+    snipgen.reset();
+    snipgen.add_match("放題");
+    snipgen.accept_text(text);
+    TEST_EQUAL(snipgen.get_snippets(), "円使い<b>放題</b>とセッ");
+
+    return true;
+}
+
 static bool test_sg_stem()
 {
     Xapian::SnippetGenerator snipgen;
@@ -992,6 +1037,8 @@ static const test_desc tests[] = {
     TESTCASE(snipgen1),
     TESTCASE(sg_first_nonword),
     TESTCASE(sg_stem),
+    TESTCASE(sg_cjk_punctuation),
+    TESTCASE(sg_cjk_ngrams),
     END_OF_TESTCASES
 };
 
Index: xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.h
===================================================================
--- xapian-core-1fmwheezy28569.orig/queryparser/snippetgenerator_internal.h	2013-05-24 07:12:27.000000000 +0000
+++ xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.h	2013-05-24 07:12:38.830056601 +0000
@@ -40,16 +40,19 @@ class SnippetGenerator::Internal : publi
     std::tr1::unordered_set<std::string> matches;
     unsigned nwhitespace;
     std::string leading_nonword;
+    std::string pending_1gram;
+    unsigned ignore_1grams;
 
     termcount horizon;
     termcount lastpos;
+    termcount xpos;	// doesn't count N>1 N-grams
     std::deque<std::string> context;
     std::string result;
 
     termcount termpos;
 
     void push_context(const std::string & term);
-    void accept_term(const std::string & term, termcount pos);
+    void accept_term(const std::string & term, termcount pos, int ngram_len);
     void accept_nonword_char(unsigned ch, termcount pos);
 
   public: