File xapian-fix-snippet-generator-punctuation-in-match.patch of Package xapian-core

Index: xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.cc
===================================================================
--- xapian-core-1fmwheezy28569.orig/queryparser/snippetgenerator_internal.cc	2013-05-24 07:12:38.830056601 +0000
+++ xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.cc	2013-05-24 07:14:31.928112153 +0000
@@ -35,15 +35,6 @@ using namespace std;
 
 namespace Xapian {
 
-void
-SnippetGenerator::Internal::add_match(const std::string & term)
-{
-    string stem = Unicode::tolower(term);
-    if (stemmer.internal.get())
-	stem = stemmer(stem);
-    matches.insert(stem);
-}
-
 // 'ngram_len' is the length in *characters* (not octets) of an N-gram,
 // or 0 if 'term' is a complete term.  We use this to detect when we
 // are being passed N-grams.  We also rely on the behaviour of the
@@ -370,6 +361,48 @@ endofterm:
     }
 }
 
+// Break 'str' into runs of word characters (as judged by check_wordchar())
+// and insert each run, lowercased and stemmed if a stemmer is set, in 'matches'.
+void
+SnippetGenerator::Internal::add_match(const std::string & str)
+{
+    const Utf8Iterator end;
+    Utf8Iterator pos(str);
+    for (;;) {
+	// Step over separators until a word character is found; the whole
+	// function is finished as soon as the input is exhausted.
+	unsigned first = 0;
+	for (; !first; ++pos) {
+	    if (pos == end) return;
+	    first = check_wordchar(*pos);
+	}
+
+	string word;
+	Unicode::append_utf8(word, first);
+
+#if 0
+	// Disabled: would make every CJK character a term of its own
+	// (the original author notes this is probably wrong).
+	if (CJK::codepoint_is_cjk(first)) {
+	    matches.insert(word);
+	    continue;
+	}
+#endif
+	// Extend the run with the word characters that follow.
+	while (pos != end) {
+	    const unsigned next = check_wordchar(*pos);
+	    if (!next) break;
+	    Unicode::append_utf8(word, next);
+	    ++pos;
+	}
+
+	// Lowercase first, then stem only when a stemmer has been set.
+	string key = Unicode::tolower(word);
+	if (stemmer.internal.get())
+	    key = stemmer(key);
+	matches.insert(key);
+    }
+}
 
 void
 SnippetGenerator::Internal::reset()
Index: xapian-core-1fmwheezy28569/tests/termgentest.cc
===================================================================
--- xapian-core-1fmwheezy28569.orig/tests/termgentest.cc	2013-05-24 07:14:13.617074471 +0000
+++ xapian-core-1fmwheezy28569/tests/termgentest.cc	2013-05-24 07:14:31.928112153 +0000
@@ -1028,6 +1028,38 @@ static bool test_sg_stem()
     return true;
 }
 
+/// Check how add_match() treats match strings that contain punctuation.
+static bool test_sg_match_punctuation()
+{
+    const std::string haystack =
+	"alpha beta gamma delta put-a-bird-on-it epsilon zeta eta theta";
+
+    Xapian::SnippetGenerator gen;
+    gen.set_context_length(3);
+    Xapian::Stem english("en");
+    gen.set_stemmer(english);
+
+    // A plain word is highlighted on its own inside the hyphenated run.
+    gen.reset();
+    gen.add_match("bird");
+    gen.accept_text(haystack);
+    TEST_EQUAL(gen.get_snippets(), "delta put-a-<b>bird</b>-on-it epsilon");
+
+    // A match string with punctuation behaves like several separate matches.
+    gen.reset();
+    gen.add_match("a-bird");
+    gen.accept_text(haystack);
+    TEST_EQUAL(gen.get_snippets(), "gamma delta put-<b>a</b>-<b>bird</b>-on-it epsilon");
+
+    gen.reset();
+    gen.add_match("put-a-bird-on-it");
+    gen.accept_text(haystack);
+    TEST_EQUAL(gen.get_snippets(),
+	       "beta gamma delta <b>put</b>-<b>a</b>-"
+	       "<b>bird</b>-<b>on</b>-<b>it</b> epsilon zeta eta");
+    return true;
+}
+
 /// Test cases for the TermGenerator.
 static const test_desc tests[] = {
     TESTCASE(termgen1),
@@ -1039,6 +1071,7 @@ static const test_desc tests[] = {
     TESTCASE(sg_stem),
     TESTCASE(sg_cjk_punctuation),
     TESTCASE(sg_cjk_ngrams),
+    TESTCASE(sg_match_punctuation),
     END_OF_TESTCASES
 };