File xapian-fix-snippet-generator-punctuation-in-match.patch of Package xapian-core
Index: xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.cc
===================================================================
--- xapian-core-1fmwheezy28569.orig/queryparser/snippetgenerator_internal.cc 2013-05-24 07:12:38.830056601 +0000
+++ xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.cc 2013-05-24 07:14:31.928112153 +0000
@@ -35,15 +35,6 @@ using namespace std;
namespace Xapian {
-void
-SnippetGenerator::Internal::add_match(const std::string & term)
-{
- string stem = Unicode::tolower(term);
- if (stemmer.internal.get())
- stem = stemmer(stem);
- matches.insert(stem);
-}
-
// 'ngram_len' is the length in *characters* (not octets) of an N-gram,
// or 0 if 'term' is a complete term. We use this to detect when we
// are being passed N-grams. We also rely on the behaviour of the
@@ -370,6 +361,48 @@ endofterm:
}
}
+/** Add each word of @a str (UTF-8) to the set of terms to highlight.
+ *
+ *  Words are runs of characters accepted by check_wordchar(), so "a-bird"
+ *  adds "a" and "bird"; each is lowercased and, if a stemmer is set, stemmed.
+ */
+void
+SnippetGenerator::Internal::add_match(const std::string & str)
+{
+    const Utf8Iterator text_end;
+    Utf8Iterator pos(str);
+    for (;;) {
+        // Skip ahead to a word's first character; stop at end of input.
+        unsigned first = 0;
+        while (!first) {
+            if (pos == text_end) return;
+            first = check_wordchar(*pos);
+            ++pos;
+        }
+        string word;
+        Unicode::append_utf8(word, first);
+#if 0
+        // treat CJK characters as 1-character terms
+        // this is probably wrong.
+        if (CJK::codepoint_is_cjk(first)) {
+            matches.insert(word);
+            continue;
+        }
+#endif
+        // Append the rest of the word.  The character that ends it is not
+        // consumed here; the skipping loop above deals with it.
+        while (!(pos == text_end)) {
+            const unsigned next = check_wordchar(*pos);
+            if (!next) break;
+            Unicode::append_utf8(word, next);
+            ++pos;
+        }
+        string stem = Unicode::tolower(word);
+        if (stemmer.internal.get())
+            stem = stemmer(stem);
+        matches.insert(stem);
+    }
+}
void
SnippetGenerator::Internal::reset()
Index: xapian-core-1fmwheezy28569/tests/termgentest.cc
===================================================================
--- xapian-core-1fmwheezy28569.orig/tests/termgentest.cc 2013-05-24 07:14:13.617074471 +0000
+++ xapian-core-1fmwheezy28569/tests/termgentest.cc 2013-05-24 07:14:31.928112153 +0000
@@ -1028,6 +1028,38 @@ static bool test_sg_stem()
return true;
}
+/// Check that add_match() treats punctuation inside a match string as a
+/// word separator, so each word is highlighted separately in the snippet.
+static bool test_sg_match_punctuation()
+{
+    Xapian::SnippetGenerator generator;
+    generator.set_context_length(3);
+    Xapian::Stem english("en");
+    generator.set_stemmer(english);
+    std::string sample("alpha beta gamma delta put-a-bird-on-it epsilon zeta eta theta");
+
+    // Case 1: a plain word is highlighted inside the hyphenated compound.
+    generator.reset();
+    generator.add_match("bird");
+    generator.accept_text(sample);
+    TEST_EQUAL(generator.get_snippets(), "delta put-a-<b>bird</b>-on-it epsilon");
+
+    // Case 2: a match with punctuation acts as one match per word.
+    generator.reset();
+    generator.add_match("a-bird");
+    generator.accept_text(sample);
+    TEST_EQUAL(generator.get_snippets(), "gamma delta put-<b>a</b>-<b>bird</b>-on-it epsilon");
+
+    // Case 3: matching the whole compound highlights all five of its words.
+    generator.reset();
+    generator.add_match("put-a-bird-on-it");
+    generator.accept_text(sample);
+    TEST_EQUAL(generator.get_snippets(), "beta gamma delta "
+               "<b>put</b>-<b>a</b>-<b>bird</b>-<b>on</b>-<b>it</b> "
+               "epsilon zeta eta");
+    return true;
+}
+
/// Test cases for the TermGenerator.
static const test_desc tests[] = {
TESTCASE(termgen1),
@@ -1039,6 +1071,7 @@ static const test_desc tests[] = {
TESTCASE(sg_stem),
TESTCASE(sg_cjk_punctuation),
TESTCASE(sg_cjk_ngrams),
+ TESTCASE(sg_match_punctuation),
END_OF_TESTCASES
};