File xapian-snippetgenerator-collated.patch of Package xapian-core

diff -urN xapian-core-1.2.16.orig/include/Makefile.mk xapian-core-1.2.16/include/Makefile.mk
--- xapian-core-1.2.16.orig/include/Makefile.mk	2013-12-04 01:13:14.000000000 +0100
+++ xapian-core-1.2.16/include/Makefile.mk	2014-01-18 12:13:18.299275423 +0100
@@ -28,6 +28,7 @@
 	include/xapian/query.h\
 	include/xapian/queryparser.h\
 	include/xapian/registry.h\
+	include/xapian/snippetgenerator.h\
 	include/xapian/stem.h\
 	include/xapian/termgenerator.h\
 	include/xapian/termiterator.h\
diff -urN xapian-core-1.2.16.orig/include/xapian/snippetgenerator.h xapian-core-1.2.16/include/xapian/snippetgenerator.h
--- xapian-core-1.2.16.orig/include/xapian/snippetgenerator.h	1970-01-01 01:00:00.000000000 +0100
+++ xapian-core-1.2.16/include/xapian/snippetgenerator.h	2014-01-18 12:13:18.299275423 +0100
@@ -0,0 +1,178 @@
+/** @file snippetgenerator.h
+ * @brief parse free text and generate snippets
+ */
+/* Copyright (C) 2007,2009,2011 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef XAPIAN_INCLUDED_SNIPPETGENERATOR_H
+#define XAPIAN_INCLUDED_SNIPPETGENERATOR_H
+
+#include <xapian/base.h>
+#include <xapian/types.h>
+#include <xapian/unicode.h>
+#include <xapian/visibility.h>
+
+#include <string>
+
+namespace Xapian {
+
+class Stem;
+
+/** Parses a piece of text and generates snippets.
+ *
+ * This module takes a piece of text and parses it to produce words which are
+ * then used to match against a set of matched terms previously returned
+ * from a query. The output is a snippet, i.e. string showing the matches
+ * highlighted and some surrounding context.
+ */
+class XAPIAN_VISIBILITY_DEFAULT SnippetGenerator {
+ public:
+ /// @private @internal Class representing the SnippetGenerator internals.
+ class Internal;
+ /// @private @internal Reference counted internals.
+ Xapian::Internal::RefCntPtr<Internal> internal;
+
+ /// Copy constructor.
+ SnippetGenerator(const SnippetGenerator & o);
+
+ /// Assignment.
+ SnippetGenerator & operator=(const SnippetGenerator & o);
+
+ /// Default constructor.
+ SnippetGenerator();
+
+ /// Destructor.
+ ~SnippetGenerator();
+
+ /// Set the Xapian::Stem object to be used for generating stemmed terms.
+ void set_stemmer(const Xapian::Stem & stemmer);
+
+ /** Add a match term to be highlighted.
+ *
+ * The term will be stemmed if a stemmer has already been configured.
+ * The term will be matched case insensitively but the case
+ * will be preserved in the snippets output.
+ *
+ * @param term Text whose word(s) are to be highlighted in the snippets.
+ */
+ void add_match(const std::string & term);
+
+ /** Accept some text.
+ *
+ * @param itor	Utf8Iterator pointing to the text to scan for matches.
+ */
+ void accept_text(const Xapian::Utf8Iterator & itor);
+
+ /** Accept some text in a std::string.
+ *
+ * @param text	The text to scan for matches.
+ */
+ void accept_text(const std::string & text) {
+	return accept_text(Utf8Iterator(text));
+ }
+
+ /** Increase the term position used by accept_text().
+ *
+ * Calling this between pieces of text (e.g. between the title and body
+ * text) creates a gap in term positions; before-context is not carried
+ * across a gap of more than two positions.
+ *
+ * @param delta	Amount to increase the term position by (default: 100).
+ */
+ void increase_termpos(Xapian::termcount delta = 100);
+
+ /// Get the current term position.
+ Xapian::termcount get_termpos() const;
+
+ /** Set the current term position.
+ *
+ * @param termpos	The new term position to set.
+ */
+ void set_termpos(Xapian::termcount termpos);
+
+ /** Get the pre-match string.
+ */
+ std::string get_pre_match() const;
+
+ /** Set the pre-match string.
+ *
+ * Matched terms are shown in the snippets string surrounded
+ * by the pre-match and post-match strings, whose default values
+ * are "" and "".
+ *
+ * @param text The new pre-match string to set (copied, never modified).
+ */
+ void set_pre_match(std::string & text);
+
+ /** Get the post-match string.
+ */
+ std::string get_post_match() const;
+
+ /** Set the post-match string.
+ *
+ * Matched terms are shown in each snippet string surrounded
+ * by the pre-match and post-match strings, whose default values
+ * are "" and "".
+ *
+ * @param text The new post-match string to set (copied, never modified).
+ */
+ void set_post_match(std::string & text);
+
+ /** Get the inter-snippet string.
+ */
+ std::string get_inter_snippet() const;
+
+ /** Set the inter-snippet string.
+ *
+ * Snippets are shown separated by the inter-snippet string, whose
+ * default value is "...".
+ *
+ * @param text The new inter-snippet string to set (copied, never modified).
+ */
+ void set_inter_snippet(std::string & text);
+
+ /** Get the context length, in words
+ */
+ unsigned get_context_length() const;
+
+ /** Set the context length, in words
+ *
+ * Before and after each highlighted match, each snippet
+ * shows some context. This parameter controls how many
+ * words of context are shown.
+ */
+ void set_context_length(unsigned length);
+
+ /** Get the resulting snippets string.
+ */
+ std::string get_snippets();
+
+ /** Reset the snippets state for another set of text.
+ *
+ * Resets the snippet results, matches, termpos, and any saved
+ * context, so that another document or another field of the same
+ * document can be accepted. Preserves parameters like stemmer etc.
+ */
+ void reset();
+
+ /// Return a string describing this object.
+ std::string get_description() const;
+};
+
+}
+
+#endif // XAPIAN_INCLUDED_SNIPPETGENERATOR_H
diff -urN xapian-core-1.2.16.orig/include/xapian.h xapian-core-1.2.16/include/xapian.h
--- xapian-core-1.2.16.orig/include/xapian.h	2013-12-04 01:13:14.000000000 +0100
+++ xapian-core-1.2.16/include/xapian.h	2014-01-18 12:13:18.299275423 +0100
@@ -68,6 +68,7 @@
 #include <xapian/queryparser.h>
 #include <xapian/valuesetmatchdecider.h>
 #include <xapian/weight.h>
+#include <xapian/snippetgenerator.h>
 
 // Stemming
 #include <xapian/stem.h>
diff -urN xapian-core-1.2.16.orig/queryparser/Makefile.mk xapian-core-1.2.16/queryparser/Makefile.mk
--- xapian-core-1.2.16.orig/queryparser/Makefile.mk	2013-12-04 01:13:23.000000000 +0100
+++ xapian-core-1.2.16/queryparser/Makefile.mk	2014-01-18 12:13:18.300275419 +0100
@@ -8,7 +8,8 @@
 	queryparser/cjk-tokenizer.h\
 	queryparser/queryparser_internal.h\
 	queryparser/queryparser_token.h\
-	queryparser/termgenerator_internal.h
+	queryparser/termgenerator_internal.h\
+	queryparser/snippetgenerator_internal.h
 
 lemon_built_sources =\
 	queryparser/queryparser_internal.cc\
@@ -62,4 +63,6 @@
 	queryparser/queryparser.cc\
 	queryparser/queryparser_internal.cc\
 	queryparser/termgenerator.cc\
-	queryparser/termgenerator_internal.cc
+	queryparser/termgenerator_internal.cc\
+	queryparser/snippetgenerator.cc\
+	queryparser/snippetgenerator_internal.cc
diff -urN xapian-core-1.2.16.orig/queryparser/snippetgenerator.cc xapian-core-1.2.16/queryparser/snippetgenerator.cc
--- xapian-core-1.2.16.orig/queryparser/snippetgenerator.cc	1970-01-01 01:00:00.000000000 +0100
+++ xapian-core-1.2.16/queryparser/snippetgenerator.cc	2014-01-18 12:13:18.301275414 +0100
@@ -0,0 +1,151 @@
+/** @file snippetgenerator.cc
+ * @brief SnippetGenerator class implementation
+ */
+/* Copyright (C) 2007 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <config.h>
+
+#include <xapian/snippetgenerator.h>
+#include <xapian/types.h>
+
+#include "snippetgenerator_internal.h"
+
+#include "str.h"
+
+using namespace std;
+using namespace Xapian;
+
+SnippetGenerator::SnippetGenerator(const SnippetGenerator & o) : internal(o.internal) { } // shares o's reference-counted internals
+
+SnippetGenerator &
+SnippetGenerator::operator=(const SnippetGenerator & o) {
+ internal = o.internal; // share o's reference-counted internals
+ return *this;
+}
+
+SnippetGenerator::SnippetGenerator() : internal(new SnippetGenerator::Internal) { } // fresh internals with default settings
+
+SnippetGenerator::~SnippetGenerator() { } // nothing to release explicitly here
+
+void
+SnippetGenerator::set_stemmer(const Xapian::Stem & stemmer)
+{
+ internal->stemmer = stemmer; // used by later add_match() and accept_text() calls
+}
+
+void
+SnippetGenerator::add_match(const std::string & term)
+{
+ internal->add_match(term); // splits into words, lower-cases and stems them
+}
+
+void
+SnippetGenerator::accept_text(const Xapian::Utf8Iterator & itor)
+{
+ internal->accept_text(itor); // the iterator is passed on by value
+}
+
+void
+SnippetGenerator::increase_termpos(Xapian::termcount delta)
+{
+ internal->termpos += delta; // accept_text() numbers the next term from here
+}
+
+Xapian::termcount
+SnippetGenerator::get_termpos() const
+{
+ return internal->termpos; // position given to the most recent term
+}
+
+void
+SnippetGenerator::set_termpos(Xapian::termcount termpos)
+{
+ internal->termpos = termpos; // the next term accepted gets termpos + 1
+}
+
+std::string
+SnippetGenerator::get_pre_match() const
+{
+ return internal->pre_match; // returned by value (a copy)
+}
+
+void
+SnippetGenerator::set_pre_match(std::string & text)
+{
+ internal->pre_match = text; // text is only copied, never modified
+}
+
+std::string
+SnippetGenerator::get_post_match() const
+{
+ return internal->post_match; // returned by value (a copy)
+}
+
+void
+SnippetGenerator::set_post_match(std::string & text)
+{
+ internal->post_match = text; // text is only copied, never modified
+}
+
+std::string
+SnippetGenerator::get_inter_snippet() const
+{
+ return internal->inter_snippet; // returned by value (a copy)
+}
+
+void
+SnippetGenerator::set_inter_snippet(std::string & text)
+{
+ internal->inter_snippet = text; // text is only copied, never modified
+}
+
+unsigned
+SnippetGenerator::get_context_length() const
+{
+ return internal->context_length; // stored as a termcount internally
+}
+
+void
+SnippetGenerator::set_context_length(unsigned length)
+{
+ internal->context_length = length; // read while later text is accepted
+}
+
+std::string
+SnippetGenerator::get_snippets()
+{
+ return internal->result; // snippet text accumulated by accept_text() so far
+}
+
+void
+SnippetGenerator::reset()
+{
+ internal->reset(); // clears results, matches and positions; settings stay
+}
+
+string
+SnippetGenerator::get_description() const
+{
+ // Produces "Xapian::SnippetGenerator(stem=<stemmer description>)"; the
+ // stem= part is left out when there are no internals to describe.
+ string desc = "Xapian::SnippetGenerator(";
+ if (internal.get())
+	desc.append("stem=").append(internal->stemmer.get_description());
+ desc.push_back(')');
+ return desc;
+}
diff -urN xapian-core-1.2.16.orig/queryparser/snippetgenerator_internal.cc xapian-core-1.2.16/queryparser/snippetgenerator_internal.cc
--- xapian-core-1.2.16.orig/queryparser/snippetgenerator_internal.cc	1970-01-01 01:00:00.000000000 +0100
+++ xapian-core-1.2.16/queryparser/snippetgenerator_internal.cc	2014-01-18 12:15:09.221768695 +0100
@@ -0,0 +1,423 @@
+/** @file snippetgenerator_internal.cc
+ * @brief SnippetGenerator class internals
+ */
+/* Copyright (C) 2007,2010,2011 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <config.h>
+
+#include "snippetgenerator_internal.h"
+
+#include <xapian/queryparser.h>
+#include <xapian/unicode.h>
+
+#include "stringutils.h"
+
+#include <limits>
+#include <string>
+#include "cjk-tokenizer.h"
+
+using namespace std;
+
+namespace Xapian {
+
+// 'ngram_len' is the length in *characters* (not octets) of an N-gram,
+// or 0 if 'term' is a complete term. We use this to detect when we
+// are being passed N-grams. We also rely on the behaviour of the
+// CJKTokenizer class which returns N-grams in increasing length order,
+// so that when ngram_len==1 we know the N-gram base position just
+// incremented.
+
+void
+SnippetGenerator::Internal::accept_term(const string & term,
+					termcount pos, int ngram_len)
+{
+ string stem = Unicode::tolower(term); // matching is case-insensitive
+ if (stemmer.internal.get())
+	stem = stemmer(stem);
+
+ // we don't keep context across termpos discontinuities (a jump of more than 2)
+ if (pos > (lastpos+2)) {
+	context.clear();
+	leading_nonword = "";
+	pending_1gram = "";
+	ignore_1grams = 0;
+ }
+ if (ngram_len <= 1)
+	xpos += (pos - lastpos); // N>1 N-grams do not advance xpos
+ lastpos = pos;
+ nwhitespace = 0; // a term ends any run of whitespace
+
+ if (matches.find(stem) != matches.end()) {
+	// found a match
+	if (xpos > horizon + context.size() + 1 && result != "") {
+	 // there was a gap from the end of the context after
+	 // the previous snippet, so start a new snippet
+	 result += inter_snippet;
+	} else {
+	 result += leading_nonword;
+	}
+	leading_nonword = "";
+
+	if (ngram_len == 1 && pending_1gram != "") {
+	 push_context(pending_1gram);
+	 pending_1gram = "";
+	}
+
+	// flush the before-context
+	while (!context.empty()) {
+	 result += context.front();
+	 context.pop_front();
+	}
+
+	// emit the match, highlighted
+	result += pre_match;
+	result += term; // the term as passed in, not the lower-cased stem
+	result += post_match;
+
+	// some following 1-grams may be included in the
+	// match text, so don't add them to context.
+	ignore_1grams = (ngram_len > 1 ? ngram_len-1 : 0);
+
+	// set the horizon to mark the end of the after-context
+	horizon = xpos + context_length + ignore_1grams;
+ } else if (xpos <= horizon) {
+	// the after-context for a match
+	if (ngram_len == 0) {
+	 result += term;
+	} else if (ngram_len == 1) {
+	 if (ignore_1grams)
+		ignore_1grams--;
+	 else
+		result += term;
+	}
+	// don't keep N>1 N-grams in context, they're redundant
+ } else {
+	// not explicitly in a context, but remember the
+	// term in the context queue for later
+	if (ngram_len == 0) {
+	 push_context(term);
+	} else if (ngram_len == 1) {
+	 if (pending_1gram != "") {
+		push_context(pending_1gram);
+		pending_1gram = "";
+	 }
+	 if (ignore_1grams)
+		ignore_1grams--;
+	 else
+		pending_1gram = term; // held back until the next 1-gram or non-word char
+	}
+	// don't keep N>1 N-grams in context, they're redundant
+ }
+}
+
+void
+SnippetGenerator::Internal::push_context(const string & term)
+{
+ // Append first and trim afterwards: with context_length == 0 the
+ // new entry is pushed and then immediately evicted again.  Evicting
+ // anything also discards the held-back leading non-word text.
+ context.push_back(term);
+ for (; context.size() > context_length; context.pop_front())
+	leading_nonword.clear();
+}
+
+void
+SnippetGenerator::Internal::accept_nonword_char(unsigned ch, termcount pos)
+{
+ if (context.empty() && leading_nonword != "") { // already holding non-word text and no context yet
+	Unicode::append_utf8(leading_nonword, ch);
+	return;
+ }
+ xpos += (pos - lastpos);
+
+ if (Unicode::is_whitespace(ch)) {
+	if (++nwhitespace > 1)
+	 return; // only the first character of a whitespace run is kept
+	ch = ' '; // and it is normalised to a plain space
+ } else {
+	nwhitespace = 0;
+ }
+
+ if (pending_1gram != "") {
+	push_context(pending_1gram);
+	pending_1gram = "";
+ }
+ ignore_1grams = 0;
+
+ if (!pos) {
+	// non-word characters before the first word
+	Unicode::append_utf8(leading_nonword, ch);
+ }
+ else if (xpos <= horizon) {
+	// the last word of the after-context of a snippet
+	if (ch == ' ' && xpos == horizon) {
+	 // after-context ends on first whitespace
+	 // after the last word in the horizon,
+	 // ...unless another one abuts it, but we
+	 // don't know that yet, so keep it around
+	 // just in case
+	 Unicode::append_utf8(leading_nonword, ch);
+	 return;
+	}
+
+	// the after-context for a match
+	Unicode::append_utf8(result, ch);
+ } else {
+	if (context.size())
+	 Unicode::append_utf8(context.back(), ch); // glue onto the newest context entry
+ }
+}
+
+// Upper limit on the size of a term: accept_text() skips anything longer
+// instead of passing it to accept_term().
+static const unsigned int MAX_PROB_TERM_LENGTH = 64;
+// FIXME: threshold is currently in bytes of UTF-8 representation, not unicode
+// characters - what actually makes most sense here?
+
+inline bool U_isupper(unsigned ch) {
+ // Non-ASCII code points never count as upper case here.
+ return ch < 128 ? C_isupper(static_cast<unsigned char>(ch)) : false;
+}
+
+inline unsigned check_wordchar(unsigned ch) {
+ // Word characters map to themselves, everything else to 0.
+ return Unicode::is_wordchar(ch) ? ch : 0;
+}
+
+/** Value representing "ignore this" when returned by check_infix() or
+ * check_infix_digit().
+ */
+const unsigned UNICODE_IGNORE = numeric_limits<unsigned>::max();
+
+// Infix check: returns the character to add to the term, UNICODE_IGNORE
+// to drop it silently, or 0 when ch cannot join two word characters.
+inline unsigned check_infix(unsigned ch) {
+ switch (ch) {
+	case '\'': case '&': case 0xb7: case 0x5f4: case 0x2027:
+	 // Unicode's word boundary rules list all of these except '&'.
+	 return ch;
+	case 0x2019: // Unicode apostrophe and single closing quote.
+	case 0x201b: // Unicode single opening quote with the tail rising.
+	 return '\'';
+	case 0x200b: case 0x200c: case 0x200d: case 0x2060: case 0xfeff:
+	 return UNICODE_IGNORE;
+ }
+ return 0;
+}
+
+inline unsigned check_infix_digit(unsigned ch) {
+ // Separators accepted between digits; list from Unicode's word identifying algorithm.
+ switch (ch) {
+	case ',':
+	case '.':
+	case ';':
+	case 0x037e: // GREEK QUESTION MARK
+	case 0x0589: // ARMENIAN FULL STOP
+	case 0x060D: // ARABIC DATE SEPARATOR
+	case 0x07F8: // NKO COMMA
+	case 0x2044: // FRACTION SLASH
+	case 0xFE10: // PRESENTATION FORM FOR VERTICAL COMMA
+	case 0xFE13: // PRESENTATION FORM FOR VERTICAL COLON
+	case 0xFE14: // PRESENTATION FORM FOR VERTICAL SEMICOLON
+	 return ch;
+	case 0x200b: case 0x200c: case 0x200d: case 0x2060: case 0xfeff:
+	 return UNICODE_IGNORE;
+ }
+ return 0;
+}
+
+inline bool is_digit(unsigned ch) {
+ // True when ch's Unicode category is DECIMAL_DIGIT_NUMBER.
+ return Unicode::DECIMAL_DIGIT_NUMBER == Unicode::get_category(ch);
+}
+
+inline unsigned check_suffix(unsigned ch) {
+ // Only '+' and '#' may trail a term (FIXME: what about '-'?).
+ const bool is_suffix = (ch == '+' || ch == '#');
+ return is_suffix ? ch : 0;
+}
+
+void
+SnippetGenerator::Internal::accept_text(Utf8Iterator itor)
+{
+ bool cjk_ngram = CJK::is_cjk_enabled(); // sampled once per call
+
+ while (true) {
+	// Advance to the start of the next term.
+	unsigned ch;
+	while (true) {
+	 if (itor == Utf8Iterator()) return; // end of text
+	 ch = check_wordchar(*itor);
+	 if (ch) break;
+	 accept_nonword_char(*itor, termpos);
+	 ++itor;
+	}
+
+	string term;
+	// Look for initials separated by '.' (e.g. P.T.O., U.N.C.L.E).
+	// Don't worry if there's a trailing '.' or not.
+	if (U_isupper(*itor)) {
+	 const Utf8Iterator end;
+	 Utf8Iterator p = itor;
+	 do {
+		Unicode::append_utf8(term, Unicode::tolower(*p++)); // NOTE(review): dots dropped and case folded, so the snippet shows e.g. "pto"
+	 } while (p != end && *p == '.' && ++p != end && U_isupper(*p));
+	 // One letter does not make an acronym! If we handled a single
+	 // uppercase letter here, we wouldn't catch M&S below.
+	 if (term.size() > 1) {
+		// Check there's not a (lower case) letter or digit
+		// immediately after it.
+		if (p == end || !Unicode::is_wordchar(*p)) {
+		 itor = p;
+		 goto endofterm;
+		}
+	 }
+	 term.resize(0); // not an acronym after all; rescan from itor
+	}
+
+	while (true) {
+	 if (cjk_ngram &&
+		CJK::codepoint_is_cjk(*itor) &&
+		Unicode::is_wordchar(*itor)) {
+		const string & cjk = CJK::get_cjk(itor);
+		for (CJKTokenIterator tk(cjk); tk != CJKTokenIterator(); ++tk) {
+		 const string & cjk_token = *tk;
+		 if (cjk_token.size() > MAX_PROB_TERM_LENGTH) continue;
+
+		 // Offer each N-gram to accept_term() at a new term position.
+		 accept_term(cjk_token, ++termpos, tk.get_length());
+		}
+		while (true) {
+		 if (itor == Utf8Iterator()) return;
+		 ch = check_wordchar(*itor);
+		 if (ch) break;
+		 accept_nonword_char(*itor, termpos);
+		 ++itor;
+		}
+	 }
+	 unsigned prevch;
+	 do {
+		Unicode::append_utf8(term, ch);
+		prevch = ch;
+		if (++itor == Utf8Iterator() ||
+		 (cjk_ngram && CJK::codepoint_is_cjk(*itor)))
+		 goto endofterm;
+		ch = check_wordchar(*itor);
+	 } while (ch);
+
+	 Utf8Iterator next(itor); // look one character past the non-word character
+	 ++next;
+	 if (next == Utf8Iterator()) break;
+	 unsigned nextch = check_wordchar(*next);
+	 if (!nextch) break;
+	 unsigned infix_ch = *itor;
+	 if (is_digit(prevch) && is_digit(*next)) {
+		infix_ch = check_infix_digit(infix_ch);
+	 } else {
+		// Handle things like '&' in AT&T, apostrophes, etc.
+		infix_ch = check_infix(infix_ch);
+	 }
+	 if (!infix_ch) break;
+	 if (infix_ch != UNICODE_IGNORE)
+		Unicode::append_utf8(term, infix_ch);
+	 ch = nextch;
+	 itor = next;
+	}
+
+	{
+	 size_t len = term.size();
+	 unsigned count = 0;
+	 while ((ch = check_suffix(*itor))) {
+		if (++count > 3) {
+		 term.resize(len); // more than 3 suffix characters: keep none of them
+		 break;
+		}
+		Unicode::append_utf8(term, ch);
+		if (++itor == Utf8Iterator()) goto endofterm;
+	 }
+	 // Don't treat fish+chips as fish+ chips.
+	 if (Unicode::is_wordchar(*itor))
+		term.resize(len);
+	}
+
+endofterm:
+	if (term.size() > MAX_PROB_TERM_LENGTH) continue; // dropped without using a term position
+	accept_term(term, ++termpos, 0);
+ }
+}
+
+void
+SnippetGenerator::Internal::add_match(const std::string & str)
+{
+ unsigned ch;
+ Utf8Iterator itor(str);
+ while (true) { // one iteration per word found in str
+
+	// skip non-word characters
+	while (true) {
+	 if (itor == Utf8Iterator()) return;
+	 ch = check_wordchar(*itor);
+	 ++itor;
+	 if (ch) break;
+	}
+
+	string term;
+	Unicode::append_utf8(term, ch); // first word character, already consumed above
+
+#if 0
+	// treat CJK characters as 1-character terms
+	// this is probably wrong.
+	if (CJK::codepoint_is_cjk(ch)) {
+	 matches.insert(term);
+	 continue;
+	}
+#endif
+
+	// collect word characters into a term
+	while (true) {
+	 if (itor == Utf8Iterator()) break;
+	 ch = check_wordchar(*itor);
+	 if (!ch) break;
+	 Unicode::append_utf8(term, ch);
+	 ++itor;
+	}
+
+	string stem = Unicode::tolower(term); // same normalisation as accept_term()
+	if (stemmer.internal.get())
+	 stem = stemmer(stem);
+	matches.insert(stem);
+ }
+}
+
+void
+SnippetGenerator::Internal::reset()
+{
+ // Drop everything derived from previously accepted text and the
+ // match set; stemmer, highlight strings and context_length are
+ // configuration and are left alone.
+ result.clear();
+ matches.clear();
+ context.clear();
+ leading_nonword.clear();
+ pending_1gram.clear();
+ // Positions and counters start again from zero.
+ horizon = lastpos = xpos = termpos = 0;
+ nwhitespace = ignore_1grams = 0;
+}
+
+}
diff -urN xapian-core-1.2.16.orig/queryparser/snippetgenerator_internal.h xapian-core-1.2.16/queryparser/snippetgenerator_internal.h
--- xapian-core-1.2.16.orig/queryparser/snippetgenerator_internal.h	1970-01-01 01:00:00.000000000 +0100
+++ xapian-core-1.2.16/queryparser/snippetgenerator_internal.h	2014-01-18 12:15:01.721802963 +0100
@@ -0,0 +1,70 @@
+/** @file snippetgenerator_internal.h
+ * @brief SnippetGenerator class internals
+ */
+/* Copyright (C) 2007 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef XAPIAN_INCLUDED_SNIPPETGENERATOR_INTERNAL_H
+#define XAPIAN_INCLUDED_SNIPPETGENERATOR_INTERNAL_H
+
+#include <xapian/base.h>
+#include <xapian/snippetgenerator.h>
+#include <xapian/queryparser.h>
+#include <xapian/stem.h>
+#include <tr1/unordered_set>
+#include <queue>
+
+namespace Xapian {
+
+class SnippetGenerator::Internal : public Xapian::Internal::RefCntBase {
+ friend class SnippetGenerator;	// the public class forwards to this state
+ Stem stemmer;			// applied to lower-cased terms when set
+ std::string pre_match;		// emitted before each highlighted match
+ std::string post_match;	// emitted after each highlighted match
+ std::string inter_snippet;	// separator put between separate snippets
+ termcount context_length;	// words of context kept around a match
+ std::tr1::unordered_set<std::string> matches; // lower-cased, stemmed match terms
+ unsigned nwhitespace;		// consecutive whitespace characters seen
+ std::string leading_nonword;	// non-word text held back for the next match
+ std::string pending_1gram;	// 1-gram not yet pushed onto the context queue
+ unsigned ignore_1grams;	// 1-grams to skip after a matched N-gram
+
+ termcount horizon;		// xpos up to which after-context is emitted
+ termcount lastpos;		// position passed to the previous accept_term()
+ termcount xpos;	// doesn't count N>1 N-grams
+ std::deque<std::string> context; // candidate before-context, oldest first
+ std::string result;		// snippet text built so far
+
+ termcount termpos;		// current term position
+
+ void push_context(const std::string & term);
+ void accept_term(const std::string & term, termcount pos, int ngram_len);
+ void accept_nonword_char(unsigned ch, termcount pos);
+
+ public:
+ Internal() : pre_match(""), post_match(""),
+		 inter_snippet("..."), context_length(5),
+		 nwhitespace(0), ignore_1grams(0), horizon(0), lastpos(0),
+		 xpos(0), termpos(0) { } // every counter starts at zero, so reset() is not needed first
+ void add_match(const std::string & term);
+ void accept_text(Utf8Iterator itor);
+ void reset();
+};
+
+}
+
+#endif // XAPIAN_INCLUDED_SNIPPETGENERATOR_INTERNAL_H
diff -urN xapian-core-1.2.16.orig/tests/termgentest.cc xapian-core-1.2.16/tests/termgentest.cc
--- xapian-core-1.2.16.orig/tests/termgentest.cc	2013-12-04 01:13:56.000000000 +0100
+++ xapian-core-1.2.16/tests/termgentest.cc	2014-01-18 12:15:09.222768690 +0100
@@ -1,4 +1,4 @@
-/* termgentest.cc: Tests of Xapian::TermGenerator
+/* termgentest.cc: Tests of Xapian::TermGenerator & SnippetGenerator
 *
 * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2009,2010,2013 Olly Betts
 * Copyright (C) 2007 Lemur Consulting Ltd
@@ -835,12 +835,243 @@
 return true;
 }
 
+static bool test_snipgen1()
+{
+ Xapian::SnippetGenerator snipgen;
+ snipgen.set_context_length(3);
+
+ std::string text("Readymade photo booth (typeWriter), occupy "
+		 "DreamCatcher Polaroid four loko butcher "
+		 "gentrify. Williamsburg, banh mi gastropub.");
+
+ snipgen.reset(); // lower-case match term, match is the first word
+ snipgen.add_match("readymade");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "Readymade "
+				 "photo booth (typeWriter),");
+
+ snipgen.reset(); // match term capitalised as in the text
+ snipgen.add_match("Readymade");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "Readymade "
+				 "photo booth (typeWriter),");
+
+ snipgen.reset(); // mixed-case match term
+ snipgen.add_match("reAdyMaDe");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "Readymade "
+				 "photo booth (typeWriter),");
+
+ snipgen.reset(); // fewer words before the match than the context length
+ snipgen.add_match("photo");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "Readymade "
+				 "photo "
+				 "booth (typeWriter), occupy");
+
+ snipgen.reset(); // full three words of context on both sides
+ snipgen.add_match("occupy");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "photo booth (typeWriter), "
+				 "occupy "
+				 "DreamCatcher Polaroid four");
+
+ snipgen.reset(); // match is the last word: no after-context
+ snipgen.add_match("gastropub");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "Williamsburg, banh mi "
+				 "gastropub.");
+
+ snipgen.reset(); // two matches three words apart share one snippet
+ snipgen.add_match("dreamcatcher");
+ snipgen.add_match("butcher");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "booth (typeWriter), occupy "
+				 "DreamCatcher "
+				 "Polaroid four loko "
+				 "butcher "
+				 "gentrify. Williamsburg, banh");
+
+ snipgen.reset(); // six words apart: the two contexts abut, still one snippet
+ snipgen.add_match("occupy");
+ snipgen.add_match("williamsburg");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "photo booth (typeWriter), "
+				 "occupy "
+				 "DreamCatcher Polaroid four loko butcher gentrify. "
+				 "Williamsburg, "
+				 "banh mi gastropub.");
+
+ snipgen.reset(); // seven words apart: two snippets joined by "..."
+ snipgen.add_match("occupy");
+ snipgen.add_match("banh");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "photo booth (typeWriter), "
+				 "occupy "
+				 "DreamCatcher Polaroid four"
+				 "..."
+				 "butcher gentrify. Williamsburg, "
+				 "banh "
+				 "mi gastropub.");
+
+ return true;
+}
+
+static bool test_sg_first_nonword()
+{
+ Xapian::SnippetGenerator snipgen;
+ snipgen.set_context_length(3); // three words of context around each match
+
+ // first character is a non-word character - this
+ // exercises some obscure code paths
+ std::string text("[Brooklyn] Re: locavore cosby sweater");
+
+ // no match at all
+ snipgen.reset();
+ snipgen.add_match("readymade");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "");
+
+ // matched first word; the leading '[' must be kept
+ snipgen.reset();
+ snipgen.add_match("brooklyn");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "["
+				 "Brooklyn"
+				 "] Re: locavore cosby");
+
+ // matched second word; the leading '[' comes with the before-context
+ snipgen.reset();
+ snipgen.add_match("re");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "[Brooklyn] "
+				 "Re: "
+				 "locavore cosby sweater");
+
+ return true;
+}
+
+static bool test_sg_cjk_punctuation()
+{
+ Xapian::SnippetGenerator snipgen;
+ snipgen.set_context_length(3);
+ Xapian::Stem stemmer("en");
+ snipgen.set_stemmer(stemmer);
+ // NOTE(review): presumably needs CJK N-gram mode enabled (accept_text() checks CJK::is_cjk_enabled()) - confirm
+ std::string text("申込み殺到！Nexus7が0円！WiMAX月額3,770円使い放題とセットでお得");
+
+ // the text contains U+FF01 FULLWIDTH EXCLAMATION MARK which
+ // is both a CJK character and a non-word character; it should
+ // be handled as non-word text and appear in both the before
+ // and after contexts
+ snipgen.reset();
+ snipgen.add_match("Nexus7");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "み殺到！Nexus7が0円！");
+
+ return true;
+}
+
+static bool test_sg_cjk_ngrams()
+{
+ Xapian::SnippetGenerator snipgen;
+ snipgen.set_context_length(3);
+ Xapian::Stem stemmer("en");
+ snipgen.set_stemmer(stemmer);
+ // NOTE(review): presumably needs CJK N-gram mode enabled (accept_text() checks CJK::is_cjk_enabled()) - confirm
+ std::string text("申込み殺到！Nexus7が0円！WiMAX月額3,770円使い放題とセットでお得");
+
+ // a one-character CJK search term should be matched
+ snipgen.reset();
+ snipgen.add_match("放");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "円使い放題とセ");
+
+ // a two-character CJK search term should be matched
+ snipgen.reset();
+ snipgen.add_match("放題");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "円使い放題とセッ");
+
+ return true;
+}
+
+static bool test_sg_stem()
+{
+ Xapian::SnippetGenerator snipgen;
+ snipgen.set_context_length(3);
+ Xapian::Stem stemmer("en");
+ snipgen.set_stemmer(stemmer); // set before add_match() so match terms get stemmed
+
+ std::string text("She sells sea shells by the sea shore");
+
+ snipgen.reset(); // match term identical to the word in the text
+ snipgen.add_match("shells");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "She sells sea shells by the sea");
+
+ snipgen.reset(); // singular form matches the plural in the text
+ snipgen.add_match("shell");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "She sells sea shells by the sea");
+
+ snipgen.reset(); // "-ed" form gives the same snippet
+ snipgen.add_match("shelled");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "She sells sea shells by the sea");
+
+ snipgen.reset(); // "-ing" form gives the same snippet
+ snipgen.add_match("shelling");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "She sells sea shells by the sea");
+
+ return true;
+}
+
+static bool test_sg_match_punctuation()
+{
+ Xapian::SnippetGenerator snipgen;
+ snipgen.set_context_length(3);
+ Xapian::Stem stemmer("en");
+ snipgen.set_stemmer(stemmer);
+
+ std::string text("alpha beta gamma delta put-a-bird-on-it epsilon zeta eta theta");
+
+ // a simple word search term should be matched; each hyphen-separated
+ // piece counts as one word of context
+ snipgen.reset();
+ snipgen.add_match("bird");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "delta put-a-bird-on-it epsilon");
+
+ // adding a search term including punctuation is like
+ // adding multiple search terms
+ snipgen.reset();
+ snipgen.add_match("a-bird");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "gamma delta put-a-bird-on-it epsilon");
+
+ snipgen.reset(); // every piece of the hyphenated phrase is a match term
+ snipgen.add_match("put-a-bird-on-it");
+ snipgen.accept_text(text);
+ TEST_EQUAL(snipgen.get_snippets(), "beta gamma delta "
+				 "put-a-bird-on-it "
+				 "epsilon zeta eta");
+
+ return true;
+}
+
 /// Test cases for the TermGenerator.
 static const test_desc tests[] = {
 TESTCASE(termgen1),
 TESTCASE(tg_spell1),
 TESTCASE(tg_spell2),
 TESTCASE(tg_max_word_length1),
+ TESTCASE(snipgen1),
+ TESTCASE(sg_first_nonword),
+ TESTCASE(sg_stem),
+ TESTCASE(sg_cjk_punctuation),
+ TESTCASE(sg_cjk_ngrams),
+ TESTCASE(sg_match_punctuation),
 END_OF_TESTCASES
 };