Projects
Kolab:3.4
xapian-core
xapian-fix-snippet-generator-CJK.patch
Log In
Username
Password
Overview
Repositories
Revisions
Requests
Users
Attributes
Meta
File xapian-fix-snippet-generator-CJK.patch of Package xapian-core
Index: xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.cc =================================================================== --- xapian-core-1fmwheezy28569.orig/queryparser/snippetgenerator_internal.cc 2013-05-24 07:12:27.000000000 +0000 +++ xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.cc 2013-05-24 07:12:38.830056601 +0000 @@ -44,8 +44,16 @@ SnippetGenerator::Internal::add_match(co matches.insert(stem); } +// 'ngram_len' is the length in *characters* (not octets) of an N-gram, +// or 0 if 'term' is a complete term. We use this to detect when we +// are being passed N-grams. We also rely on the behaviour of the +// CJKTokenizer class which returns N-grams in increasing length order, +// so that when ngram_len==1 we know the N-gram base position just +// incremented. + void -SnippetGenerator::Internal::accept_term(const string & term, termcount pos) +SnippetGenerator::Internal::accept_term(const string & term, + termcount pos, int ngram_len) { string stem = Unicode::tolower(term); if (stemmer.internal.get()) @@ -55,13 +63,17 @@ SnippetGenerator::Internal::accept_term( if (pos > (lastpos+2)) { context.clear(); leading_nonword = ""; + pending_1gram = ""; + ignore_1grams = 0; } + if (ngram_len <= 1) + xpos += (pos - lastpos); lastpos = pos; nwhitespace = 0; if (matches.find(stem) != matches.end()) { // found a match - if (pos > horizon + context.size() + 1 && result != "") { + if (xpos > horizon + context.size() + 1 && result != "") { // there was a gap from the end of the context after // the previous snippet, so start a new snippet result += inter_snippet; @@ -70,6 +82,11 @@ SnippetGenerator::Internal::accept_term( } leading_nonword = ""; + if (ngram_len == 1 && pending_1gram != "") { + push_context(pending_1gram); + pending_1gram = ""; + } + // flush the before-context while (!context.empty()) { result += context.front(); @@ -81,15 +98,39 @@ SnippetGenerator::Internal::accept_term( result += term; result += post_match; + // some 
following 1-grams may be included in the + // match text, so don't add them to context. + ignore_1grams = (ngram_len > 1 ? ngram_len-1 : 0); + // set the horizon to mark the end of the after-context - horizon = pos + context_length; - } else if (pos <= horizon) { + horizon = xpos + context_length + ignore_1grams; + } else if (xpos <= horizon) { // the after-context for a match - result += term; + if (ngram_len == 0) { + result += term; + } else if (ngram_len == 1) { + if (ignore_1grams) + ignore_1grams--; + else + result += term; + } + // don't keep N>1 N-grams in context, they're redundant } else { // not explicitly in a context, but remember the // term in the context queue for later - push_context(term); + if (ngram_len == 0) { + push_context(term); + } else if (ngram_len == 1) { + if (pending_1gram != "") { + push_context(pending_1gram); + pending_1gram = ""; + } + if (ignore_1grams) + ignore_1grams--; + else + pending_1gram = term; + } + // don't keep N>1 N-grams in context, they're redundant } } @@ -111,6 +152,7 @@ SnippetGenerator::Internal::accept_nonwo Unicode::append_utf8(leading_nonword, ch); return; } + xpos += (pos - lastpos); if (Unicode::is_whitespace(ch)) { if (++nwhitespace > 1) @@ -120,13 +162,19 @@ SnippetGenerator::Internal::accept_nonwo nwhitespace = 0; } + if (pending_1gram != "") { + push_context(pending_1gram); + pending_1gram = ""; + } + ignore_1grams = 0; + if (!pos) { // non-word characters before the first word Unicode::append_utf8(leading_nonword, ch); } - else if (pos <= horizon) { + else if (xpos <= horizon) { // the last word of the after-context of a snippet - if (ch == ' ' && pos == horizon) { + if (ch == ' ' && xpos == horizon) { // after-context ends on first whitespace // after the last word in the horizon, // ...unless another one abuts it, but we @@ -261,7 +309,7 @@ SnippetGenerator::Internal::accept_text( if (cjk_token.size() > MAX_PROB_TERM_LENGTH) continue; // Add unstemmed form positional information. 
- accept_term(cjk_token, ++termpos); + accept_term(cjk_token, ++termpos, tk.get_length()); } while (true) { if (itor == Utf8Iterator()) return; @@ -318,7 +366,7 @@ SnippetGenerator::Internal::accept_text( endofterm: if (term.size() > MAX_PROB_TERM_LENGTH) continue; - accept_term(term, ++termpos); + accept_term(term, ++termpos, 0); } } @@ -329,11 +377,14 @@ SnippetGenerator::Internal::reset() result = ""; horizon = 0; lastpos = 0; + xpos = 0; nwhitespace = 0; context.clear(); matches.clear(); termpos = 0; leading_nonword = ""; + pending_1gram = ""; + ignore_1grams = 0; } } Index: xapian-core-1fmwheezy28569/tests/termgentest.cc =================================================================== --- xapian-core-1fmwheezy28569.orig/tests/termgentest.cc 2013-05-24 07:01:21.000000000 +0000 +++ xapian-core-1fmwheezy28569/tests/termgentest.cc 2013-05-24 07:14:13.617074471 +0000 @@ -951,6 +951,51 @@ static bool test_sg_first_nonword() return true; } +static bool test_sg_cjk_punctuation() +{ + Xapian::SnippetGenerator snipgen; + snipgen.set_context_length(3); + Xapian::Stem stemmer("en"); + snipgen.set_stemmer(stemmer); + + std::string text("申込み殺到!Nexus7が0円!WiMAX月額3,770円使い放題とセットでお得"); + + // the text contains U+FF01 FULLWIDTH EXCLAMATION MARK which + // is both a CJK character and a non-word character; it should + // be handled as non-word text and appear in both the before + // and after contexts + snipgen.reset(); + snipgen.add_match("Nexus7"); + snipgen.accept_text(text); + TEST_EQUAL(snipgen.get_snippets(), "み殺到!<b>Nexus7</b>が0円!"); + + return true; +} + +static bool test_sg_cjk_ngrams() +{ + Xapian::SnippetGenerator snipgen; + snipgen.set_context_length(3); + Xapian::Stem stemmer("en"); + snipgen.set_stemmer(stemmer); + + std::string text("申込み殺到!Nexus7が0円!WiMAX月額3,770円使い放題とセットでお得"); + + // a one-character CJK search term should be matched + snipgen.reset(); + snipgen.add_match("放"); + snipgen.accept_text(text); + TEST_EQUAL(snipgen.get_snippets(), "円使い<b>放</b>題とセ"); + + 
// a two-character CJK search term should be matched + snipgen.reset(); + snipgen.add_match("放題"); + snipgen.accept_text(text); + TEST_EQUAL(snipgen.get_snippets(), "円使い<b>放題</b>とセッ"); + + return true; +} + static bool test_sg_stem() { Xapian::SnippetGenerator snipgen; @@ -992,6 +1037,8 @@ static const test_desc tests[] = { TESTCASE(snipgen1), TESTCASE(sg_first_nonword), TESTCASE(sg_stem), + TESTCASE(sg_cjk_punctuation), + TESTCASE(sg_cjk_ngrams), END_OF_TESTCASES }; Index: xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.h =================================================================== --- xapian-core-1fmwheezy28569.orig/queryparser/snippetgenerator_internal.h 2013-05-24 07:12:27.000000000 +0000 +++ xapian-core-1fmwheezy28569/queryparser/snippetgenerator_internal.h 2013-05-24 07:12:38.830056601 +0000 @@ -40,16 +40,19 @@ class SnippetGenerator::Internal : publi std::tr1::unordered_set<std::string> matches; unsigned nwhitespace; std::string leading_nonword; + std::string pending_1gram; + unsigned ignore_1grams; termcount horizon; termcount lastpos; + termcount xpos; // doesn't count N>1 N-grams std::deque<std::string> context; std::string result; termcount termpos; void push_context(const std::string & term); - void accept_term(const std::string & term, termcount pos); + void accept_term(const std::string & term, termcount pos, int ngram_len); void accept_nonword_char(unsigned ch, termcount pos); public:
Locations
Projects
Search
Status Monitor
Help
Open Build Service
OBS Manuals
API Documentation
OBS Portal
Reporting a Bug
Contact
Mailing List
Forums
Chat (IRC)
Twitter
Open Build Service (OBS) is an openSUSE project.