File xapian-fix-snippet-generator-CJK.patch of Package xapian-core

Index: xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.cc
===================================================================
--- xapian-core-1fmwheezy28569.orig/queryparser/snippetgenerator_internal.cc	2013-05-24 07:12:27.000000000 +0000
+++ xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.cc	2013-05-24 07:12:38.830056601 +0000
@@ -44,8 +44,16 @@ SnippetGenerator::Internal::add_match(co
 matches.insert(stem);
 }
 
+// 'ngram_len' is the length in *characters* (not octets) of an N-gram,
+// or 0 if 'term' is a complete (non-N-gram) term. We use this to detect
+// when we are being passed N-grams. We also rely on the behaviour of
+// the CJKTokenizer class, which returns N-grams in increasing length
+// order, so that whenever ngram_len==1 we know the N-gram base position
+// has just been incremented.
+
 void
-SnippetGenerator::Internal::accept_term(const string & term, termcount pos)
+SnippetGenerator::Internal::accept_term(const string & term,
+					termcount pos, int ngram_len)
 {
 string stem = Unicode::tolower(term);
 if (stemmer.internal.get())
@@ -55,13 +63,17 @@ SnippetGenerator::Internal::accept_term(
 if (pos > (lastpos+2)) {
 	context.clear();
 	leading_nonword = "";
+	pending_1gram = "";
+	ignore_1grams = 0;
 }
+ if (ngram_len <= 1)
+	xpos += (pos - lastpos);
 lastpos = pos;
 nwhitespace = 0;
 
 if (matches.find(stem) != matches.end()) {
 	// found a match
-	if (pos > horizon + context.size() + 1 && result != "") {
+	if (xpos > horizon + context.size() + 1 && result != "") {
 	 // there was a gap from the end of the context after
 	 // the previous snippet, so start a new snippet
 	 result += inter_snippet;
@@ -70,6 +82,11 @@ SnippetGenerator::Internal::accept_term(
 	}
 	leading_nonword = "";
 
+	if (ngram_len == 1 && pending_1gram != "") {
+	 push_context(pending_1gram);
+	 pending_1gram = "";
+	}
+
 	// flush the before-context
 	while (!context.empty()) {
 	 result += context.front();
@@ -81,15 +98,39 @@ SnippetGenerator::Internal::accept_term(
 	result += term;
 	result += post_match;
 
+	// an N>1 N-gram match already contains the next ngram_len-1
+	// 1-grams, so skip those instead of emitting or queuing them.
+	ignore_1grams = (ngram_len > 1 ? ngram_len-1 : 0);
+
 	// set the horizon to mark the end of the after-context
-	horizon = pos + context_length;
- } else if (pos <= horizon) {
+	horizon = xpos + context_length + ignore_1grams;
+ } else if (xpos <= horizon) {
 	// the after-context for a match
-	result += term;
+	if (ngram_len == 0) {
+	 result += term;
+	} else if (ngram_len == 1) {
+	 if (ignore_1grams)
+		ignore_1grams--;
+	 else
+		result += term;
+	}
+	// don't keep N>1 N-grams in context, they're redundant
 } else {
 	// not explicitly in a context, but remember the
 	// term in the context queue for later
-	push_context(term);
+	if (ngram_len == 0) {
+	 push_context(term);
+	} else if (ngram_len == 1) {
+	 if (pending_1gram != "") {
+		push_context(pending_1gram);
+		pending_1gram = "";
+	 }
+	 if (ignore_1grams)
+		ignore_1grams--;
+	 else
+		pending_1gram = term;
+	}
+	// don't keep N>1 N-grams in context, they're redundant
 }
 }
 
@@ -111,6 +152,7 @@ SnippetGenerator::Internal::accept_nonwo
 	Unicode::append_utf8(leading_nonword, ch);
 	return;
 }
+ xpos += (pos - lastpos);
 
 if (Unicode::is_whitespace(ch)) {
 	if (++nwhitespace > 1)
@@ -120,13 +162,19 @@ SnippetGenerator::Internal::accept_nonwo
 	nwhitespace = 0;
 }
 
+ if (pending_1gram != "") {
+	push_context(pending_1gram);
+	pending_1gram = "";
+ }
+ ignore_1grams = 0;
+
 if (!pos) {
 	// non-word characters before the first word
 	Unicode::append_utf8(leading_nonword, ch);
 }
- else if (pos <= horizon) {
+ else if (xpos <= horizon) {
 	// the last word of the after-context of a snippet
-	if (ch == ' ' && pos == horizon) {
+	if (ch == ' ' && xpos == horizon) {
 	 // after-context ends on first whitespace
 	 // after the last word in the horizon,
 	 // ...unless another one abuts it, but we
@@ -261,7 +309,7 @@ SnippetGenerator::Internal::accept_text(
 		 if (cjk_token.size() > MAX_PROB_TERM_LENGTH) continue;
 
 		 // Add unstemmed form positional information.
-		 accept_term(cjk_token, ++termpos);
+		 accept_term(cjk_token, ++termpos, tk.get_length());
 		}
 		while (true) {
 		 if (itor == Utf8Iterator()) return;
@@ -318,7 +366,7 @@ SnippetGenerator::Internal::accept_text(
 
 endofterm:
 	if (term.size() > MAX_PROB_TERM_LENGTH) continue;
-	accept_term(term, ++termpos);
+	accept_term(term, ++termpos, 0);
 }
 }
 
@@ -329,11 +377,14 @@ SnippetGenerator::Internal::reset()
 result = "";
 horizon = 0;
 lastpos = 0;
+ xpos = 0;
 nwhitespace = 0;
 context.clear();
 matches.clear();
 termpos = 0;
 leading_nonword = "";
+ pending_1gram = "";
+ ignore_1grams = 0;
 }
 
 }
Index: xapian-core-1fmwheezy28569/tests/termgentest.cc
===================================================================
--- xapian-core-1fmwheezy28569.orig/tests/termgentest.cc	2013-05-24 07:01:21.000000000 +0000
+++ xapian-core-1fmwheezy28569/tests/termgentest.cc	2013-05-24 07:14:13.617074471 +0000
@@ -951,6 +951,51 @@ static bool test_sg_first_nonword()
 return true;
 }
 
+static bool test_sg_cjk_punctuation()
+{
+ Xapian::SnippetGenerator snipgen;
+ snipgen.set_context_length(3);
+ Xapian::Stem stemmer("en");
+ snipgen.set_stemmer(stemmer);
+
+ std::string text("申込み殺到！Nexus7が0円！WiMAX月額3,770円使い放題とセットでお得");
+
+ // Checks punctuation handling: the text contains U+FF01 FULLWIDTH
+ // EXCLAMATION MARK, which is both a CJK character and a non-word
+ // character; it should be handled as non-word text and appear in
+ // both the before and after contexts of the "Nexus7" match
+ snipgen.reset();
+ snipgen.add_match("Nexus7");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "み殺到！Nexus7が0円！");
+
+ return true;
+}
+
+static bool test_sg_cjk_ngrams()
+{
+ Xapian::SnippetGenerator snipgen;
+ snipgen.set_context_length(3);
+ Xapian::Stem stemmer("en");
+ snipgen.set_stemmer(stemmer);
+
+ std::string text("申込み殺到！Nexus7が0円！WiMAX月額3,770円使い放題とセットでお得");
+
+ // a one-character CJK search term should be matched, with three characters of context on each side
+ snipgen.reset();
+ snipgen.add_match("放");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "円使い放題とセ");
+
+ // a two-character CJK search term should be matched whole, again with three characters on each side
+ snipgen.reset();
+ snipgen.add_match("放題");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "円使い放題とセッ");
+
+ return true;
+}
+
 static bool test_sg_stem()
 {
 Xapian::SnippetGenerator snipgen;
@@ -992,6 +1037,8 @@ static const test_desc tests[] = {
 TESTCASE(snipgen1),
 TESTCASE(sg_first_nonword),
 TESTCASE(sg_stem),
+ TESTCASE(sg_cjk_punctuation),
+ TESTCASE(sg_cjk_ngrams),
 END_OF_TESTCASES
 };
 
Index: xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.h
===================================================================
--- xapian-core-1fmwheezy28569.orig/queryparser/snippetgenerator_internal.h	2013-05-24 07:12:27.000000000 +0000
+++ xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.h	2013-05-24 07:12:38.830056601 +0000
@@ -40,16 +40,19 @@ class SnippetGenerator::Internal : publi
 std::tr1::unordered_set<std::string> matches;
 unsigned nwhitespace;
 std::string leading_nonword;
+ std::string pending_1gram;
+ unsigned ignore_1grams;
 
 termcount horizon;
 termcount lastpos;
+ termcount xpos;	// doesn't count N>1 N-grams
 std::deque<std::string> context;
 std::string result;
 
 termcount termpos;
 
 void push_context(const std::string & term);
- void accept_term(const std::string & term, termcount pos);
+ void accept_term(const std::string & term, termcount pos, int ngram_len);
 void accept_nonword_char(unsigned ch, termcount pos);
 
 public: