blob: 34d8ba88cbd1d06f2275e09b9ce334eff669dce2 [file] [log] [blame]
Ben Murdoch2385ea32013-08-06 11:01:04 +01001// Copyright 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "chrome/browser/devtools/devtools_file_system_indexer.h"
6
7#include <iterator>
8
9#include "base/bind.h"
10#include "base/callback.h"
11#include "base/file_util.h"
12#include "base/files/file_enumerator.h"
13#include "base/files/file_util_proxy.h"
14#include "base/lazy_instance.h"
15#include "base/logging.h"
16#include "base/platform_file.h"
17#include "base/strings/utf_string_conversions.h"
18#include "content/public/browser/browser_thread.h"
19
20using base::Bind;
21using base::Callback;
22using base::FileEnumerator;
23using base::FilePath;
24using base::FileUtilProxy;
25using base::Time;
26using base::TimeDelta;
27using base::TimeTicks;
28using base::PassPlatformFile;
29using base::PlatformFile;
30using base::PlatformFileError;
31using base::PlatformFileInfo;
32using content::BrowserThread;
33using std::map;
34using std::set;
35using std::string;
36using std::vector;
37
38namespace {
39
40typedef int32 Trigram;
41typedef char TrigramChar;
42typedef uint16 FileId;
43
44const int kMinTimeoutBetweenWorkedNitification = 200;
45// Trigram characters include all ASCII printable characters (32-126) except for
46// the capital letters, because the index is case insensitive.
47const size_t kTrigramCharacterCount = 126 - 'Z' - 1 + 'A' - ' ' + 1;
48const size_t kTrigramCount =
49 kTrigramCharacterCount * kTrigramCharacterCount * kTrigramCharacterCount;
50const int kMaxReadLength = 10 * 1024;
51const TrigramChar kUndefinedTrigramChar = -1;
52const Trigram kUndefinedTrigram = -1;
53
54base::LazyInstance<vector<bool> >::Leaky g_is_binary_char =
55 LAZY_INSTANCE_INITIALIZER;
56
57base::LazyInstance<vector<TrigramChar> >::Leaky g_trigram_chars =
58 LAZY_INSTANCE_INITIALIZER;
59
60class Index {
61 public:
62 Index();
63 Time LastModifiedTimeForFile(const FilePath& file_path);
64 void SetTrigramsForFile(const FilePath& file_path,
65 const vector<Trigram>& index,
66 const Time& time);
67 vector<FilePath> Search(string query);
68 void PrintStats();
69 void NormalizeVectors();
70
71 private:
72 ~Index();
73
74 FileId GetFileId(const FilePath& file_path);
75
76 typedef map<FilePath, FileId> FileIdsMap;
77 FileIdsMap file_ids_;
78 FileId last_file_id_;
79 // The index in this vector is the trigram id.
80 vector<vector<FileId> > index_;
81 typedef map<FilePath, Time> IndexedFilesMap;
82 IndexedFilesMap index_times_;
83 vector<bool> is_normalized_;
84
85 DISALLOW_COPY_AND_ASSIGN(Index);
86};
87
88base::LazyInstance<Index>::Leaky g_trigram_index = LAZY_INSTANCE_INITIALIZER;
89
90void InitIsBinaryCharMap() {
91 for (size_t i = 0; i < 256; ++i) {
92 unsigned char ch = i;
93 bool is_binary_char = ch < 9 || (ch >= 14 && ch < 32) || ch == 127;
94 g_is_binary_char.Get().push_back(is_binary_char);
95 }
96}
97
98bool IsBinaryChar(char c) {
99 unsigned char uc = static_cast<unsigned char>(c);
100 return g_is_binary_char.Get()[uc];
101}
102
103void InitTrigramCharsMap() {
104 for (size_t i = 0; i < 256; ++i) {
105 if (i > 127) {
106 g_trigram_chars.Get().push_back(kUndefinedTrigramChar);
107 continue;
108 }
109 char ch = i;
110 if (ch == '\t')
111 ch = ' ';
112 if (ch >= 'A' && ch <= 'Z')
113 ch = ch - 'A' + 'a';
114 if ((IsBinaryChar(ch)) || (ch < ' ')) {
115 g_trigram_chars.Get().push_back(kUndefinedTrigramChar);
116 continue;
117 }
118
119 if (ch >= 'Z')
120 ch = ch - 'Z' - 1 + 'A';
121 ch -= ' ';
122 char signed_trigram_char_count = static_cast<char>(kTrigramCharacterCount);
123 CHECK(ch >= 0 && ch < signed_trigram_char_count);
124 g_trigram_chars.Get().push_back(ch);
125 }
126}
127
128TrigramChar TrigramCharForChar(char c) {
129 unsigned char uc = static_cast<unsigned char>(c);
130 return g_trigram_chars.Get()[uc];
131}
132
133Trigram TrigramAtIndex(const vector<TrigramChar>& trigram_chars, size_t index) {
134 static int kTrigramCharacterCountSquared =
135 kTrigramCharacterCount * kTrigramCharacterCount;
136 if (trigram_chars[index] == kUndefinedTrigramChar ||
137 trigram_chars[index + 1] == kUndefinedTrigramChar ||
138 trigram_chars[index + 2] == kUndefinedTrigramChar)
139 return kUndefinedTrigram;
140 Trigram trigram = kTrigramCharacterCountSquared * trigram_chars[index] +
141 kTrigramCharacterCount * trigram_chars[index + 1] +
142 trigram_chars[index + 2];
143 return trigram;
144}
145
146Index::Index() : last_file_id_(0) {
147 index_.resize(kTrigramCount);
148 is_normalized_.resize(kTrigramCount);
149 std::fill(is_normalized_.begin(), is_normalized_.end(), true);
150}
151
152Index::~Index() {}
153
154Time Index::LastModifiedTimeForFile(const FilePath& file_path) {
155 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
156 Time last_modified_time;
157 if (index_times_.find(file_path) != index_times_.end())
158 last_modified_time = index_times_[file_path];
159 return last_modified_time;
160}
161
162void Index::SetTrigramsForFile(const FilePath& file_path,
163 const vector<Trigram>& index,
164 const Time& time) {
165 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
166 FileId file_id = GetFileId(file_path);
167 vector<Trigram>::const_iterator it = index.begin();
168 for (; it != index.end(); ++it) {
169 Trigram trigram = *it;
170 index_[trigram].push_back(file_id);
171 is_normalized_[trigram] = false;
172 }
173 index_times_[file_path] = time;
174}
175
176vector<FilePath> Index::Search(string query) {
177 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
178 const char* data = query.c_str();
179 vector<TrigramChar> trigram_chars;
180 trigram_chars.reserve(query.size());
181 for (size_t i = 0; i < query.size(); ++i)
182 trigram_chars.push_back(TrigramCharForChar(data[i]));
183 vector<Trigram> trigrams;
184 for (size_t i = 0; i + 2 < query.size(); ++i) {
185 Trigram trigram = TrigramAtIndex(trigram_chars, i);
186 if (trigram != kUndefinedTrigram)
187 trigrams.push_back(trigram);
188 }
189 set<FileId> file_ids;
190 bool first = true;
191 vector<Trigram>::const_iterator it = trigrams.begin();
192 for (; it != trigrams.end(); ++it) {
193 Trigram trigram = *it;
194 if (first) {
195 std::copy(index_[trigram].begin(),
196 index_[trigram].end(),
197 std::inserter(file_ids, file_ids.begin()));
198 first = false;
199 continue;
200 }
201 set<FileId> intersection;
202 std::set_intersection(file_ids.begin(),
203 file_ids.end(),
204 index_[trigram].begin(),
205 index_[trigram].end(),
206 std::inserter(intersection, intersection.begin()));
207 file_ids.swap(intersection);
208 }
209 vector<FilePath> result;
210 FileIdsMap::const_iterator ids_it = file_ids_.begin();
211 for (; ids_it != file_ids_.end(); ++ids_it) {
212 if (trigrams.size() == 0 ||
213 file_ids.find(ids_it->second) != file_ids.end()) {
214 result.push_back(ids_it->first);
215 }
216 }
217 return result;
218}
219
220FileId Index::GetFileId(const FilePath& file_path) {
221 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
222 string file_path_str = file_path.AsUTF8Unsafe();
223 if (file_ids_.find(file_path) != file_ids_.end())
224 return file_ids_[file_path];
225 file_ids_[file_path] = ++last_file_id_;
226 return last_file_id_;
227}
228
229void Index::NormalizeVectors() {
230 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
231 for (size_t i = 0; i < kTrigramCount; ++i) {
232 if (!is_normalized_[i]) {
233 std::sort(index_[i].begin(), index_[i].end());
234 if (index_[i].capacity() > index_[i].size())
235 vector<FileId>(index_[i]).swap(index_[i]);
236 is_normalized_[i] = true;
237 }
238 }
239}
240
241void Index::PrintStats() {
242 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
243 LOG(ERROR) << "Index stats:";
244 size_t size = 0;
245 size_t maxSize = 0;
246 size_t capacity = 0;
247 for (size_t i = 0; i < kTrigramCount; ++i) {
248 if (index_[i].size() > maxSize)
249 maxSize = index_[i].size();
250 size += index_[i].size();
251 capacity += index_[i].capacity();
252 }
253 LOG(ERROR) << " - total trigram count: " << size;
254 LOG(ERROR) << " - max file count per trigram: " << maxSize;
255 LOG(ERROR) << " - total vectors capacity " << capacity;
256 size_t total_index_size =
257 capacity * sizeof(FileId) + sizeof(vector<FileId>) * kTrigramCount;
258 LOG(ERROR) << " - estimated total index size " << total_index_size;
259}
260
261typedef Callback<void(bool, const vector<bool>&)> IndexerCallback;
262
263} // namespace
264
265DevToolsFileSystemIndexer::FileSystemIndexingJob::FileSystemIndexingJob(
266 const FilePath& file_system_path,
267 const TotalWorkCallback& total_work_callback,
268 const WorkedCallback& worked_callback,
269 const DoneCallback& done_callback)
270 : file_system_path_(file_system_path),
271 total_work_callback_(total_work_callback),
272 worked_callback_(worked_callback),
273 done_callback_(done_callback),
274 files_indexed_(0),
275 stopped_(false) {
276 current_trigrams_set_.resize(kTrigramCount);
277 current_trigrams_.reserve(kTrigramCount);
278}
279
280DevToolsFileSystemIndexer::FileSystemIndexingJob::~FileSystemIndexingJob() {}
281
282void DevToolsFileSystemIndexer::FileSystemIndexingJob::Start() {
283 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
284 BrowserThread::PostTask(
285 BrowserThread::FILE,
286 FROM_HERE,
287 Bind(&FileSystemIndexingJob::CollectFilesToIndex, this));
288}
289
290void DevToolsFileSystemIndexer::FileSystemIndexingJob::Stop() {
291 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
292 BrowserThread::PostTask(BrowserThread::FILE,
293 FROM_HERE,
294 Bind(&FileSystemIndexingJob::StopOnFileThread, this));
295}
296
297void DevToolsFileSystemIndexer::FileSystemIndexingJob::StopOnFileThread() {
298 stopped_ = true;
299}
300
301void DevToolsFileSystemIndexer::FileSystemIndexingJob::CollectFilesToIndex() {
302 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
303 if (stopped_)
304 return;
305 if (!file_enumerator_) {
306 file_enumerator_.reset(
307 new FileEnumerator(file_system_path_, true, FileEnumerator::FILES));
308 }
309 FilePath file_path = file_enumerator_->Next();
310 if (file_path.empty()) {
311 BrowserThread::PostTask(
312 BrowserThread::UI,
313 FROM_HERE,
314 Bind(total_work_callback_, file_path_times_.size()));
315 indexing_it_ = file_path_times_.begin();
316 IndexFiles();
317 return;
318 }
319 Time saved_last_modified_time =
320 g_trigram_index.Get().LastModifiedTimeForFile(file_path);
321 FileEnumerator::FileInfo file_info = file_enumerator_->GetInfo();
322 Time current_last_modified_time = file_info.GetLastModifiedTime();
323 if (current_last_modified_time > saved_last_modified_time) {
324 file_path_times_[file_path] = current_last_modified_time;
325 }
326 BrowserThread::PostTask(
327 BrowserThread::FILE,
328 FROM_HERE,
329 Bind(&FileSystemIndexingJob::CollectFilesToIndex, this));
330}
331
332void DevToolsFileSystemIndexer::FileSystemIndexingJob::IndexFiles() {
333 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
334 if (stopped_)
335 return;
336 if (indexing_it_ == file_path_times_.end()) {
337 g_trigram_index.Get().NormalizeVectors();
338 BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, done_callback_);
339 return;
340 }
341 FilePath file_path = indexing_it_->first;
342 FileUtilProxy::CreateOrOpen(
343 BrowserThread::GetMessageLoopProxyForThread(BrowserThread::FILE).get(),
344 file_path,
Ben Murdochbb1529c2013-08-08 10:24:53 +0100345 base::PLATFORM_FILE_OPEN | base::PLATFORM_FILE_READ,
Ben Murdoch2385ea32013-08-06 11:01:04 +0100346 Bind(&FileSystemIndexingJob::StartFileIndexing, this));
347}
348
349void DevToolsFileSystemIndexer::FileSystemIndexingJob::StartFileIndexing(
350 PlatformFileError error,
351 PassPlatformFile pass_file,
352 bool) {
353 if (error != base::PLATFORM_FILE_OK) {
354 current_file_ = base::kInvalidPlatformFileValue;
355 FinishFileIndexing(false);
356 return;
357 }
358 current_file_ = pass_file.ReleaseValue();
359 current_file_offset_ = 0;
360 current_trigrams_.clear();
361 std::fill(current_trigrams_set_.begin(), current_trigrams_set_.end(), false);
362 ReadFromFile();
363}
364
365void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReadFromFile() {
366 if (stopped_) {
367 CloseFile();
368 return;
369 }
370 FileUtilProxy::Read(
371 BrowserThread::GetMessageLoopProxyForThread(BrowserThread::FILE).get(),
372 current_file_,
373 current_file_offset_,
374 kMaxReadLength,
375 Bind(&FileSystemIndexingJob::OnRead, this));
376}
377
378void DevToolsFileSystemIndexer::FileSystemIndexingJob::OnRead(
379 PlatformFileError error,
380 const char* data,
381 int bytes_read) {
382 if (error != base::PLATFORM_FILE_OK) {
383 FinishFileIndexing(false);
384 return;
385 }
386
387 if (!bytes_read || bytes_read < 3) {
388 FinishFileIndexing(true);
389 return;
390 }
391
392 size_t size = static_cast<size_t>(bytes_read);
393 vector<TrigramChar> trigram_chars;
394 trigram_chars.reserve(size);
395 for (size_t i = 0; i < size; ++i) {
396 if (IsBinaryChar(data[i])) {
397 current_trigrams_.clear();
398 FinishFileIndexing(true);
399 return;
400 }
401 trigram_chars.push_back(TrigramCharForChar(data[i]));
402 }
403
404 for (size_t i = 0; i + 2 < size; ++i) {
405 Trigram trigram = TrigramAtIndex(trigram_chars, i);
406 if ((trigram != kUndefinedTrigram) && !current_trigrams_set_[trigram]) {
407 current_trigrams_set_[trigram] = true;
408 current_trigrams_.push_back(trigram);
409 }
410 }
411 current_file_offset_ += bytes_read - 2;
412 ReadFromFile();
413}
414
415void DevToolsFileSystemIndexer::FileSystemIndexingJob::FinishFileIndexing(
416 bool success) {
417 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
418 CloseFile();
419 if (success) {
420 FilePath file_path = indexing_it_->first;
421 g_trigram_index.Get().SetTrigramsForFile(
422 file_path, current_trigrams_, file_path_times_[file_path]);
423 }
424 ReportWorked();
425 ++indexing_it_;
426 IndexFiles();
427}
428
429void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseFile() {
430 if (current_file_ != base::kInvalidPlatformFileValue) {
431 FileUtilProxy::Close(
432 BrowserThread::GetMessageLoopProxyForThread(BrowserThread::FILE).get(),
433 current_file_,
434 Bind(&FileSystemIndexingJob::CloseCallback, this));
435 }
436}
437
438void DevToolsFileSystemIndexer::FileSystemIndexingJob::CloseCallback(
439 PlatformFileError error) {}
440
441void DevToolsFileSystemIndexer::FileSystemIndexingJob::ReportWorked() {
442 TimeTicks current_time = TimeTicks::Now();
443 bool should_send_worked_nitification = true;
444 if (!last_worked_notification_time_.is_null()) {
445 TimeDelta delta = current_time - last_worked_notification_time_;
446 if (delta.InMilliseconds() < kMinTimeoutBetweenWorkedNitification)
447 should_send_worked_nitification = false;
448 }
449 ++files_indexed_;
450 if (should_send_worked_nitification) {
451 last_worked_notification_time_ = current_time;
452 BrowserThread::PostTask(
453 BrowserThread::UI, FROM_HERE, Bind(worked_callback_, files_indexed_));
454 files_indexed_ = 0;
455 }
456}
457
458DevToolsFileSystemIndexer::DevToolsFileSystemIndexer() {
459 static bool maps_initialized = false;
460 if (!maps_initialized) {
461 InitIsBinaryCharMap();
462 InitTrigramCharsMap();
463 maps_initialized = true;
464 }
465}
466
467DevToolsFileSystemIndexer::~DevToolsFileSystemIndexer() {}
468
469scoped_refptr<DevToolsFileSystemIndexer::FileSystemIndexingJob>
470DevToolsFileSystemIndexer::IndexPath(
471 const string& file_system_path,
472 const TotalWorkCallback& total_work_callback,
473 const WorkedCallback& worked_callback,
474 const DoneCallback& done_callback) {
475 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
476 scoped_refptr<FileSystemIndexingJob> indexing_job =
477 new FileSystemIndexingJob(FilePath::FromUTF8Unsafe(file_system_path),
478 total_work_callback,
479 worked_callback,
480 done_callback);
481 indexing_job->Start();
482 return indexing_job;
483}
484
485void DevToolsFileSystemIndexer::SearchInPath(const string& file_system_path,
486 const string& query,
487 const SearchCallback& callback) {
488 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::UI));
489 BrowserThread::PostTask(
490 BrowserThread::FILE,
491 FROM_HERE,
492 Bind(&DevToolsFileSystemIndexer::SearchInPathOnFileThread,
493 this,
494 file_system_path,
495 query,
496 callback));
497}
498
499void DevToolsFileSystemIndexer::SearchInPathOnFileThread(
500 const string& file_system_path,
501 const string& query,
502 const SearchCallback& callback) {
503 DCHECK(BrowserThread::CurrentlyOn(BrowserThread::FILE));
504 vector<FilePath> file_paths = g_trigram_index.Get().Search(query);
505 vector<string> result;
506 FilePath path = FilePath::FromUTF8Unsafe(file_system_path);
507 vector<FilePath>::const_iterator it = file_paths.begin();
508 for (; it != file_paths.end(); ++it) {
509 if (path.IsParent(*it))
510 result.push_back(it->AsUTF8Unsafe());
511 }
512 BrowserThread::PostTask(BrowserThread::UI, FROM_HERE, Bind(callback, result));
513}