Projects
Kolab:3.4
xapian-core
xapian-fix-snippet-generator-punctuation-in-match.patch
Log In
Username
Password
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File xapian-fix-snippet-generator-punctuation-in-match.patch of Package xapian-core
Index: xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.cc =================================================================== --- xapian-core-1fmwheezy28569.orig/queryparser/snippetgenerator_internal.cc 2013-05-24 07:12:38.830056601 +0000 +++ xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.cc 2013-05-24 07:14:31.928112153 +0000 @@ -35,15 +35,6 @@ using namespace std; namespace Xapian { -void -SnippetGenerator::Internal::add_match(const std::string & term) -{ - string stem = Unicode::tolower(term); - if (stemmer.internal.get()) - stem = stemmer(stem); - matches.insert(stem); -} - // 'ngram_len' is the length in *characters* (not octets) of an N-gram, // or 0 if 'term' is a complete term. We use this to detect when we // are being passed N-grams. We also rely on the behaviour of the @@ -370,6 +361,48 @@ endofterm: } } +void +SnippetGenerator::Internal::add_match(const std::string & str) +{ + unsigned ch; + Utf8Iterator itor(str); + while (true) { + + // skip non-word characters + while (true) { + if (itor == Utf8Iterator()) return; + ch = check_wordchar(*itor); + ++itor; + if (ch) break; + } + + string term; + Unicode::append_utf8(term, ch); + +#if 0 + // treat CJK characters as 1-character terms + // this is probably wrong. 
+ if (CJK::codepoint_is_cjk(ch)) { + matches.insert(term); + continue; + } +#endif + + // collect word characters into a term + while (true) { + if (itor == Utf8Iterator()) break; + ch = check_wordchar(*itor); + if (!ch) break; + Unicode::append_utf8(term, ch); + ++itor; + } + + string stem = Unicode::tolower(term); + if (stemmer.internal.get()) + stem = stemmer(stem); + matches.insert(stem); + } +} void SnippetGenerator::Internal::reset() Index: xapian-core-1fmwheezy28569/tests/termgentest.cc =================================================================== --- xapian-core-1fmwheezy28569.orig/tests/termgentest.cc 2013-05-24 07:14:13.617074471 +0000 +++ xapian-core-1fmwheezy28569/tests/termgentest.cc 2013-05-24 07:14:31.928112153 +0000 @@ -1028,6 +1028,38 @@ static bool test_sg_stem() return true; } +static bool test_sg_match_punctuation() +{ + Xapian::SnippetGenerator snipgen; + snipgen.set_context_length(3); + Xapian::Stem stemmer("en"); + snipgen.set_stemmer(stemmer); + + std::string text("alpha beta gamma delta put-a-bird-on-it epsilon zeta eta theta"); + + // a simple word search term should be matched + snipgen.reset(); + snipgen.add_match("bird"); + snipgen.accept_text(text); + TEST_EQUAL(snipgen.get_snippets(), "delta put-a-<b>bird</b>-on-it epsilon"); + + // adding a search term including punctuation is like + // adding multiple search terms + snipgen.reset(); + snipgen.add_match("a-bird"); + snipgen.accept_text(text); + TEST_EQUAL(snipgen.get_snippets(), "gamma delta put-<b>a</b>-<b>bird</b>-on-it epsilon"); + + snipgen.reset(); + snipgen.add_match("put-a-bird-on-it"); + snipgen.accept_text(text); + TEST_EQUAL(snipgen.get_snippets(), "beta gamma delta " + "<b>put</b>-<b>a</b>-<b>bird</b>-<b>on</b>-<b>it</b> " + "epsilon zeta eta"); + + return true; +} + /// Test cases for the TermGenerator. 
static const test_desc tests[] = { TESTCASE(termgen1), @@ -1039,6 +1071,7 @@ static const test_desc tests[] = { TESTCASE(sg_stem), TESTCASE(sg_cjk_punctuation), TESTCASE(sg_cjk_ngrams), + TESTCASE(sg_match_punctuation), END_OF_TESTCASES };
Locations
Projects
Search
Status Monitor
Help
Open Build Service
OBS Manuals
API Documentation
OBS Portal
Reporting a Bug
Contact
Mailing List
Forums
Chat (IRC)
Twitter
Open Build Service (OBS) is an openSUSE project.