presage 0.9.1
contextTracker.cpp
Go to the documentation of this file.
1
2/******************************************************
3 * Presage, an extensible predictive text entry system
4 * ---------------------------------------------------
5 *
6 * Copyright (C) 2008 Matteo Vescovi <matteo.vescovi@yahoo.co.uk>
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License along
19 with this program; if not, write to the Free Software Foundation, Inc.,
20 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21 *
22 **********(*)*/
23
24
25#include "contextTracker.h"
26#include "../utility.h"
27#include "../predictorRegistry.h"
28#include "../tokenizer/forwardTokenizer.h"
29
30#include <stdlib.h> // for atoi()
31
32const char* ContextTracker::LOGGER = "Presage.ContextTracker.LOGGER";
33const char* ContextTracker::SLIDING_WINDOW_SIZE = "Presage.ContextTracker.SLIDING_WINDOW_SIZE";
34const char* ContextTracker::LOWERCASE_MODE = "Presage.ContextTracker.LOWERCASE_MODE";
35const char* ContextTracker::ONLINE_LEARNING = "Presage.ContextTracker.ONLINE_LEARNING";
36
38 PredictorRegistry* registry,
39 PresageCallback* callback,
40 const unsigned char wChars[],
41 const char tChars[],
42 const char bChars[],
43 const char cChars[])
44 : wordChars (reinterpret_cast<const char *>(wChars)),
45 separatorChars (tChars),
46 blankspaceChars(bChars),
47 controlChars (cChars),
48 predictorRegistry (registry),
49 logger ("ContextTracker", std::cerr),
50 //tokenizer (pastStream, blankspaceChars, separatorChars),
51 lowercase_mode (true),
52 dispatcher (this)
53{
54 if (callback) {
56 } else {
57 throw new PresageException(PRESAGE_INVALID_CALLBACK_ERROR, "Invalid callback object");
58 }
59
65
66 // set pointer to this context tracker in predictor registry so that
67 // predictors can be constructed when next iterator is requested
68 //
71 }
72
73 // build dispatch map
78}
79
81{
83}
84
85void ContextTracker::set_logger (const std::string& value)
86{
87 logger << setlevel (value);
88 logger << INFO << "LOGGER: " << value << endl;
89}
90
91void ContextTracker::set_sliding_window_size (const std::string& value)
92{
94 logger << INFO << "SLIDING_WINDOWS_SIZE: " << value << endl;
95}
96
97void ContextTracker::set_lowercase_mode (const std::string& value)
98{
100 logger << INFO << "LOWERCASE_MODE: " << value << endl;
101}
102
103void ContextTracker::set_online_learning(const std::string& value)
104{
106 logger << INFO << "ONLINE_LEARNING: " << value << endl;
107}
108
110{
112 if (new_callback) {
113 context_tracker_callback = new_callback;
114 }
115 return result;
116}
117
122{
124}
125
127{
128 // detect change that needs to be learned
129 std::string change = contextChangeDetector->change(getPastStream());
130
131 if (online_learning)
132 {
133 learn (change);
134 }
135
136 // update sliding window
138}
139
140void ContextTracker::learn(const std::string& text) const
141{
142 logger << INFO << "learn(): text: " << text << endl;
143
144 std::stringstream stream_to_learn(text);
145
146 // split stream up into tokens
147 std::vector<std::string> tokens;
148 ForwardTokenizer tok(stream_to_learn,
152 logger << INFO << "learn(): tokenized change: ";
153 while (tok.hasMoreTokens()) {
154 std::string token = tok.nextToken();
155 tokens.push_back(token);
156 logger << INFO << token << '|';
157 }
158 logger << INFO << endl;
159
160 if (! tokens.empty()) {
161 // remove prefix (partially entered token or empty token)
162 tokens.pop_back();
163 }
164
165 if ((logger << INFO).shouldLog())
166 {
167 logger << "learn(): sanitized change: ";
168 for (std::vector<std::string>::const_iterator it = tokens.begin();
169 it != tokens.end();
170 it++) {
171 logger << *it << '|';
172 }
173 logger << endl;
174 }
175
176 // time to learn
178 Predictor* predictor = 0;
179
180 while (it.hasNext()) {
181 predictor = it.next();
182 predictor->learn(tokens);
183 }
184}
185
186std::string ContextTracker::getPrefix() const
187{
188 return getToken(0);
189}
190
191std::string ContextTracker::getToken(const int index) const
192{
193 std::stringstream pastStringStream(context_tracker_callback->get_past_stream());
194 ReverseTokenizer tokenizer(pastStringStream, blankspaceChars, separatorChars);
195 tokenizer.lowercaseMode(lowercase_mode);
196
197 std::string token;
198 int i = 0;
199 while (tokenizer.hasMoreTokens() && i <= index) {
200 token = tokenizer.nextToken();
201 i++;
202 }
203 if (i <= index) {
204 // in case the index points too far back
205 token = "";
206 }
207 return token;
208
210// "a b c"
211// 2 1 0
212// 0 1 2
213// 1 2 3
214//
215// ForwardTokenizer tokenizer(pastStream, blankspaceChars, separatorChars);
216// tokenizer.lowercaseMode(lowercase_mode);
217// std::string result;
218// int tokens = tokenizer.countTokens();
219// // why oh why is this clear() required to get it to work???
220// pastStream.clear();
221// int j = 0;
222// while (tokenizer.hasMoreTokens() && j < tokens - index) {
223// result = tokenizer.nextToken();
224// j++;
225//
226// std::cerr << "ContextTracker::getToken() current token: " << result << std::endl;
227// }
228// return result;
229}
230
231std::string ContextTracker::getExtraTokenToLearn(const int index, const std::vector<std::string>& change) const
232{
233 //logger << DEBUG
234 // << "past_stream : " << getPastStream() << endl
235 // << "change : " << contextChangeDetector->change(getPastStream()) << endl
236 // << "sliding_window: " << contextChangeDetector->get_sliding_window() + "\n" << endl;
237
238
239 // Extra tokens to learn are to be found in (past_stream - change)
240 //
241 // The change tokens are tokens that have not been seen or learnt
242 // before.
243 //
244 // The extra tokens to learn are tokens that have been seen and
245 // learn before, but that we need to reuse to fill out the n-gram
246 // of required cardinality that we are about to learn.
247 //
248 // To find the extra tokens to learn, we use the size of tokenized
249 // change vector to offset the index and extract the extra tokens
250 // to learn from the past stream.
251 //
252 // For example:
253 // past_stream : "The quick brown fox jumped over the "
254 // change : |over|the|
255 // extra_tokens: |The|quick|brown|fox|jumped|
256 //
257 return getToken(index + change.size());
258}
259
261{
263}
264
266{
267 std::string result = context_tracker_callback->get_past_stream();
268 return result;
269}
270
271bool ContextTracker::isCompletionValid(const std::string& completion) const
272{
273 bool result = false;
274
275 std::string prefix = getPrefix();
276 prefix = Utility::strtolower(prefix); // no need to be case sensitive
277 if (completion.find(prefix) == 0) {
278 result = true;
279 }
280
281 return result;
282}
283
284bool ContextTracker::isWordChar(const char c) const
285{
286 if(wordChars.find(c, 0) != std::string::npos)
287 return true;
288 else
289 return false;
290}
291
292bool ContextTracker::isSeparatorChar(const char c) const
293{
294 if(separatorChars.find(c, 0) != std::string::npos)
295 return true;
296 else
297 return false;
298}
299
300bool ContextTracker::isBlankspaceChar(const char c) const
301{
302 if(blankspaceChars.find(c, 0) != std::string::npos)
303 return true;
304 else
305 return false;
306}
307
308bool ContextTracker::isControlChar(const char c) const
309{
310 if(controlChars.find(c, 0) != std::string::npos)
311 return true;
312 else
313 return false;
314}
315
317{
318 return wordChars;
319}
320
322{
323 return separatorChars;
324}
325
327{
328 return blankspaceChars;
329}
330
332{
333 return controlChars;
334}
335
336std::string ContextTracker::toString() const
337{
339}
340
341void ContextTracker::update (const Observable* variable)
342{
343 logger << DEBUG << "Notification received: "
344 << variable->get_name () << " - " << variable->get_value () << endl;
345
346 dispatcher.dispatch (variable);
347}
void update_sliding_window(const std::string &str)
void set_sliding_window_size(const std::string &str)
bool context_change(const std::string &past_stream) const
std::string change(const std::string &past_stream) const
void set_logger(const std::string &value)
static const char * LOWERCASE_MODE
std::string blankspaceChars
std::string getExtraTokenToLearn(const int index, const std::vector< std::string > &change) const
bool isControlChar(const char) const
static const char * LOGGER
bool isSeparatorChar(const char) const
std::string getControlChars() const
std::string getPastStream() const
Logger< char > logger
static const char * SLIDING_WINDOW_SIZE
std::string controlChars
void set_online_learning(const std::string &value)
std::string toString() const
bool isCompletionValid(const std::string &) const
std::string getWordChars() const
bool isWordChar(const char) const
static const char * ONLINE_LEARNING
std::string getBlankspaceChars() const
const PresageCallback * callback(const PresageCallback *callback)
std::string separatorChars
ContextChangeDetector * contextChangeDetector
std::string getSeparatorChars() const
void set_sliding_window_size(const std::string &value)
bool isBlankspaceChar(const char) const
std::string getToken(const int) const
std::string getPrefix() const
ContextTracker(Configuration *config, PredictorRegistry *predictorRegistry, PresageCallback *callback, const unsigned char[]=DEFAULT_WORD_CHARS, const char[]=DEFAULT_SEPARATOR_CHARS, const char[]=DEFAULT_BLANKSPACE_CHARS, const char[]=DEFAULT_CONTROL_CHARS)
PredictorRegistry * predictorRegistry
const PresageCallback * context_tracker_callback
void set_lowercase_mode(const std::string &value)
void learn(const std::string &text) const
Learn from text.
Dispatcher< ContextTracker > dispatcher
std::string getFutureStream() const
std::string wordChars
void dispatch(const Observable *var)
Definition: dispatcher.h:73
void map(Observable *var, const mbr_func_ptr_t &ptr)
Definition: dispatcher.h:62
virtual bool hasMoreTokens() const
virtual std::string nextToken()
virtual std::string get_name() const =0
virtual std::string get_value() const =0
void setContextTracker(ContextTracker *ct)
virtual void learn(const std::vector< std::string > &change)=0
virtual std::string get_future_stream() const =0
virtual std::string get_past_stream() const =0
virtual bool hasMoreTokens() const
virtual std::string nextToken()
void lowercaseMode(const bool)
Definition: tokenizer.cpp:81
static bool isYes(const char *)
Definition: utility.cpp:185
static char * strtolower(char *)
Definition: utility.cpp:42
_SetLevel setlevel(std::string __l)
Manipulator for level.
Definition: logger.h:46
const Logger< _charT, _Traits > & endl(const Logger< _charT, _Traits > &lgr)
Definition: logger.h:278
std::string config
Definition: presageDemo.cpp:70
@ PRESAGE_INVALID_CALLBACK_ERROR