MultiIndex
|
00001 00003 // Copyright 2012 Yandex Artem Babenko 00004 #ifndef INDEXER_H_ 00005 #define INDEXER_H_ 00006 00007 #include <ctime> 00008 #include <map> 00009 00010 #include <boost/archive/binary_iarchive.hpp> 00011 #include <boost/archive/binary_oarchive.hpp> 00012 00013 #include <boost/algorithm/string/split.hpp> 00014 #include <boost/algorithm/string.hpp> 00015 00016 #include <boost/lexical_cast.hpp> 00017 00018 #include <boost/serialization/serialization.hpp> 00019 #include <boost/serialization/set.hpp> 00020 #include <boost/serialization/vector.hpp> 00021 00022 #include "data_util.h" 00023 #include "multitable.hpp" 00024 00025 00026 using std::ifstream; 00027 using std::map; 00028 using std::multimap; 00029 using std::ofstream; 00030 using std::string; 00031 00032 using boost::lexical_cast; 00033 using boost::split; 00034 00035 extern int THREADS_COUNT; 00036 00037 extern Dimensions SPACE_DIMENSION; 00038 00039 extern enum PointType point_type; 00040 00041 IndexConfig gConfig; 00042 00048 template<class Record> 00049 class MultiIndexer { 00050 public: 00055 MultiIndexer(const int multiplicity = 2); 00067 void BuildMultiIndex(const string& points_filename, 00068 const string& metainfo_filename, 00069 const int points_count, 00070 const vector<Centroids>& coarse_vocabs, 00071 const vector<Centroids>& fine_vocabs, 00072 const RerankMode& mode, 00073 const bool build_coarse_quantization, 00074 const string& files_prefix, 00075 const string& coarse_quantization_filename = ""); 00076 private: 00083 void PrepareCoarseQuantization(const string& points_filename, 00084 const int points_count, 00085 const vector<Centroids>& coarse_vocabs); 00094 void GetCoarseQuantizationsForSubset(const string& points_filename, 00095 const int start_pid, 00096 const int subset_size, 00097 const vector<Centroids>& coarse_vocabs, 00098 vector<vector<ClusterId> >* 00099 transposed_coarse_quantizations); 00106 void SerializeCoarseQuantizations(const vector<vector<ClusterId> >& 00107 transposed_coarse_quantizations, 00108 const string& filename); 00113 void SerializeMultiIndexFiles(); 00117 void ConvertPointsInCellsCountToCellEdges(); 00118 00127 void FillMultiIndex(const string& points_filename, 00128 const int points_count, 00129 const vector<Centroids>& coarse_vocabs, 00130 const vector<Centroids>& fine_vocabs, 00131 const RerankMode& mode); 00142 void FillMultiIndexForSubset(const string& points_filename, 00143 const PointId start_pid, 00144 const int points_count, 00145 const vector<Centroids>& coarse_vocabs, 00146 const vector<Centroids>& fine_vocabs, 00147 const RerankMode& mode, 00148 Multitable<int>* points_written_in_index); 00149 00156 void GetPointCoarseQuantization(const PointId pid, 00157 const string& filename, 00158 vector<ClusterId>* coarse_quantization); 00165 void FillPointRerankInfo(const Point& point, 00166 const PointId pid, 00167 const vector<Centroids>& fine_vocabs); 00175 void RestorePointsInCellsCountFromCourseQuantization(const string& points_filename, 00176 const int points_count, 00177 const vector<Centroids>& coarse_vocabs); 00181 int GetInputCoordSizeof(); 00187 void ReadPoint(ifstream& input, Point* point); 00192 void InitBlasStructures(const vector<Centroids>& coarse_vocabs); 00196 string files_prefix_; 00200 string coarse_quantization_filename_; 00204 int multiplicity_; 00208 Multitable<int> point_in_cells_count_; 00212 MultiIndex<Record> multiindex_; 00216 boost::mutex cell_counts_mutex_; 00220 vector<float*> coarse_vocabs_matrices_; 00224 vector<vector<float> > coarse_centroids_norms_; 00225 }; 00226 00227 template<class Record> 00228 inline void GetRecord(const Point& point, const PointId pid, 00229 const vector<ClusterId> coarse_quantization, 00230 const vector<Centroids>& coarse_vocabs, 00231 Record* result) { 00232 } 00233 00234 template<class Record> 00235 void InitParameters(const vector<Centroids>& fine_vocabs, 00236 const RerankMode& mode, 00237 const string& metainfo_filename) { 00238 gConfig.fine_vocabs = fine_vocabs; 00239 gConfig.rerank_mode = mode; 00240 } 00241 00242 00244 template<class Record> 00245 MultiIndexer<Record>::MultiIndexer(const int multiplicity) { 00246 if(multiplicity < 0) { 00247 throw std::logic_error("Multiplicity < 0"); 00248 } 00249 multiplicity_ = multiplicity; 00250 } 00251 00252 template<class Record> 00253 int MultiIndexer<Record>::GetInputCoordSizeof() { 00254 if(point_type == FVEC) { 00255 return (int)sizeof(float); 00256 } else if(point_type == BVEC) { 00257 return (int)sizeof(unsigned char); 00258 } 00259 } 00260 00261 template<class Record> 00262 void MultiIndexer<Record>::ReadPoint(ifstream& input, Point* point) { 00263 if(!input.good()) { 00264 throw std::logic_error("Bad input stream"); 00265 } 00266 if(point_type == FVEC) { 00267 ReadVector<float, Coord>(input, point); 00268 } else if(point_type == BVEC) { 00269 ReadVector<unsigned char, Coord>(input, point); 00270 } 00271 } 00272 00273 template<class Record> 00274 void MultiIndexer<Record>::SerializeCoarseQuantizations(const vector<vector<ClusterId> >& 00275 transposed_coarse_quantizations, 00276 const string& filename) { 00277 ofstream quantizations_stream; 00278 quantizations_stream.open(filename.c_str(), ios::binary); 00279 if(!quantizations_stream.good()) { 00280 throw std::logic_error("Bad input stream"); 00281 } 00282 cout << "Writing coarse quantizations started" << endl; 00283 for(PointId pid = 0; pid < transposed_coarse_quantizations[0].size(); ++pid) { 00284 for(int subspace_index = 0; subspace_index < multiplicity_; ++subspace_index) { 00285 ClusterId quantization = transposed_coarse_quantizations[subspace_index][pid]; 00286 quantizations_stream.write((char*)&quantization, sizeof(quantization)); 00287 } 00288 } 00289 quantizations_stream.close(); 00290 cout << "Writing coarse quantizations started" << endl; 00291 } 00292 00293 template<class Record> 00294 void MultiIndexer<Record>::SerializeMultiIndexFiles() { 00295 cout << "Start multiindex serializing....\n"; 00296 ofstream cell_edges(string(files_prefix_ + "_cell_edges.bin").c_str(), ios::binary); 00297 boost::archive::binary_oarchive arc_cell_edges(cell_edges); 00298 arc_cell_edges << multiindex_.cell_edges; 00299 ofstream multi_array(string(files_prefix_ + "_multi_array.bin").c_str(), ios::binary); 00300 boost::archive::binary_oarchive arc_multi_array(multi_array); 00301 arc_multi_array << multiindex_.multiindex; 00302 cout << "Finish multiindex serializing....\n"; 00303 } 00304 00305 template<class Record> 00306 void MultiIndexer<Record>::GetCoarseQuantizationsForSubset(const string& points_filename, 00307 const int start_pid, 00308 const int subset_size, 00309 const vector<Centroids>& coarse_vocabs, 00310 vector<vector<ClusterId> >* 00311 transposed_coarse_quantizations) { 00312 ifstream point_stream; 00313 point_stream.open(points_filename.c_str(), ios::binary); 00314 if(!point_stream.good()) { 00315 throw std::logic_error("Bad input points stream"); 00316 } 00317 // we assume points are stored in .fvecs or .bvecs format 00318 point_stream.seekg(start_pid * (GetInputCoordSizeof() * SPACE_DIMENSION + sizeof(Dimensions)), ios::beg); 00319 vector<ClusterId> coarse_quantization(multiplicity_); 00320 for(int point_number = 0; point_number < subset_size; ++point_number) { 00321 if(point_number % 10000 == 0) { 00322 cout << "Getting coarse quantization, point # " << start_pid + point_number << endl; 00323 } 00324 Point current_point; 00325 ReadPoint(point_stream, ¤t_point); 00326 int subpoints_dimension = SPACE_DIMENSION / multiplicity_; 00327 for(int coarse_index = 0; coarse_index < multiplicity_; ++coarse_index) { 00328 Dimensions start_dim = coarse_index * subpoints_dimension; 00329 Dimensions final_dim = start_dim + subpoints_dimension; 00330 ClusterId nearest = GetNearestClusterId(current_point, coarse_vocabs.at(coarse_index), 00331 start_dim, final_dim); 00332 transposed_coarse_quantizations->at(coarse_index)[start_pid + point_number] = nearest; 00333 coarse_quantization[coarse_index] = nearest; 00334 } 00335 int global_index = point_in_cells_count_.GetCellGlobalIndex(coarse_quantization); 00336 cell_counts_mutex_.lock(); 00337 ++(point_in_cells_count_.table[global_index]); 00338 cell_counts_mutex_.unlock(); 00339 } 00340 } 00341 00342 template<class Record> 00343 void MultiIndexer<Record>::PrepareCoarseQuantization(const string& points_filename, 00344 const int points_count, 00345 const vector<Centroids>& coarse_vocabs) { 00346 // we use transposed quantizations for efficient memory usage 00347 vector<vector<ClusterId> > transposed_coarse_quantizations; 00348 transposed_coarse_quantizations.resize(multiplicity_); 00349 vector<int> multiindex_table_dimensions; 00350 for(int i = 0; i < multiplicity_; ++i) { 00351 transposed_coarse_quantizations[i].resize(points_count); 00352 multiindex_table_dimensions.push_back(coarse_vocabs[i].size()); 00353 } 00354 point_in_cells_count_.Resize(multiindex_table_dimensions); 00355 cout << "Memory for coarse quantizations allocated" << endl; 00356 boost::thread_group index_threads; 00357 int thread_points_count = points_count / THREADS_COUNT; 00358 for(int thread_id = 0; thread_id < THREADS_COUNT; ++thread_id) { 00359 PointId start_pid = thread_points_count * thread_id; 00360 index_threads.create_thread(boost::bind(&MultiIndexer::GetCoarseQuantizationsForSubset, 00361 this, points_filename, start_pid, thread_points_count, 00362 boost::cref(coarse_vocabs), &transposed_coarse_quantizations)); 00363 } 00364 index_threads.join_all(); 00365 if(coarse_quantization_filename_.empty()) { 00366 coarse_quantization_filename_ = files_prefix_ + "_coarse_quantizations.bin"; 00367 } 00368 cout << "Coarse quantizations are calculated" << endl; 00369 SerializeCoarseQuantizations(transposed_coarse_quantizations, coarse_quantization_filename_); 00370 cout << "Coarse quantizations are serialized" << endl; 00371 } 00372 00373 template<class Record> 00374 void MultiIndexer<Record>::ConvertPointsInCellsCountToCellEdges() { 00375 cout << "Converting points in cells to cell edges...\n"; 00376 multiindex_.cell_edges = point_in_cells_count_; 00377 multiindex_.cell_edges.table[0] = 0; 00378 for(int global_index = 1; 00379 global_index < point_in_cells_count_.table.size(); 00380 ++global_index) { 00381 multiindex_.cell_edges.table[global_index] = multiindex_.cell_edges.table[global_index - 1] + 00382 point_in_cells_count_.table[global_index - 1]; 00383 } 00384 // we do not need this table more 00385 point_in_cells_count_.table.clear(); 00386 cout << "Finish converting points in cells to cell edges...\n"; 00387 } 00388 00389 template<class Record> 00390 void MultiIndexer<Record>::GetPointCoarseQuantization(const PointId pid, 00391 const string& filename, 00392 vector<ClusterId>* coarse_quantization) { 00393 ifstream coarse_quantization_stream; 00394 coarse_quantization_stream.open(filename.c_str(), ios::binary); 00395 if(!coarse_quantization_stream.good()) { 00396 throw std::logic_error("Bad input coarse quantizations stream"); 00397 } 00398 coarse_quantization_stream.seekg((long long)pid * sizeof(ClusterId) * multiplicity_, ios::beg); 00399 for(int coarse_index = 0; coarse_index < multiplicity_; ++coarse_index) { 00400 coarse_quantization_stream.read((char*)&(coarse_quantization->at(coarse_index)), 00401 sizeof(coarse_quantization->at(coarse_index))); 00402 } 00403 } 00404 00405 template<class Record> 00406 void MultiIndexer<Record>::FillMultiIndexForSubset(const string& points_filename, 00407 const PointId start_pid, 00408 const int points_count, 00409 const vector<Centroids>& coarse_vocabs, 00410 const vector<Centroids>& fine_vocabs, 00411 const RerankMode& mode, 00412 Multitable<int>* points_written_in_index) { 00413 ifstream point_stream; 00414 point_stream.open(points_filename.c_str(), ios::binary); 00415 if(!point_stream.good()) { 00416 throw std::logic_error("Bad input points stream"); 00417 } 00418 point_stream.seekg((long long)start_pid * (GetInputCoordSizeof() * SPACE_DIMENSION + sizeof(Dimensions)), ios::beg); 00419 for(int point_number = 0; point_number < points_count; ++point_number) { 00420 if(point_number % 10000 == 0) { 00421 cout << "Filling multiindex, point # " << start_pid + point_number << endl; 00422 } 00423 Point current_point; 00424 ReadPoint(point_stream, ¤t_point); 00425 vector<ClusterId> coarse_quantization(multiplicity_); 00426 GetPointCoarseQuantization(start_pid + point_number, 00427 coarse_quantization_filename_, 00428 &coarse_quantization); 00429 int current_written_count = points_written_in_index->GetValue(coarse_quantization); 00430 int pid_multiindex = multiindex_.cell_edges.GetValue(coarse_quantization) + current_written_count; 00431 GetRecord<Record>(current_point, start_pid + point_number, 00432 coarse_quantization, coarse_vocabs, &(multiindex_.multiindex[pid_multiindex])); 00433 cell_counts_mutex_.lock(); 00434 points_written_in_index->SetValue(current_written_count + 1, coarse_quantization); 00435 cell_counts_mutex_.unlock(); 00436 } 00437 } 00438 00439 template<class Record> 00440 void MultiIndexer<Record>::FillMultiIndex(const string& points_filename, 00441 const int points_count, 00442 const vector<Centroids>& coarse_vocabs, 00443 const vector<Centroids>& fine_vocabs, 00444 const RerankMode& mode) { 00445 ConvertPointsInCellsCountToCellEdges(); 00446 multiindex_.multiindex.resize(points_count); 00447 cout << "Indexing started..." << endl; 00448 00449 Multitable<int> points_written_in_index(multiindex_.cell_edges.dimensions); 00450 int thread_points_count = points_count / THREADS_COUNT; 00451 boost::thread_group threads; 00452 for(int thread_id = 0; thread_id < THREADS_COUNT; ++thread_id) { 00453 PointId start_pid = thread_points_count * thread_id; 00454 threads.create_thread(boost::bind(&MultiIndexer::FillMultiIndexForSubset, this, points_filename, start_pid, 00455 thread_points_count, boost::cref(coarse_vocabs), 00456 boost::cref(fine_vocabs), mode, &points_written_in_index)); 00457 } 00458 threads.join_all(); 00459 cout << "Indexing finished..." << endl; 00460 } 00461 00462 template<class Record> 00463 void MultiIndexer<Record>::RestorePointsInCellsCountFromCourseQuantization(const string& points_filename, 00464 const int points_count, 00465 const vector<Centroids>& coarse_vocabs) { 00466 vector<int> dimensions; 00467 for(int i = 0; i < multiplicity_; ++i) { 00468 dimensions.push_back(coarse_vocabs[i].size()); 00469 } 00470 point_in_cells_count_.Resize(dimensions); 00471 ifstream coarse_quantization_stream; 00472 coarse_quantization_stream.open(coarse_quantization_filename_.c_str(), ios::binary); 00473 if(!coarse_quantization_stream.good()) { 00474 throw std::logic_error("Bad input coarse quantizations stream"); 00475 } 00476 CoarseQuantization quantization(multiplicity_); 00477 for(PointId pid = 0; pid < points_count; ++pid) { 00478 if(pid % 100000 == 0) { 00479 cout << pid << endl; 00480 } 00481 for(int subspace_index = 0; subspace_index < multiplicity_; ++subspace_index) { 00482 coarse_quantization_stream.read((char*)&(quantization[subspace_index]), 00483 sizeof(ClusterId)); 00484 } 00485 int cell_global_index = point_in_cells_count_.GetCellGlobalIndex(quantization); 00486 point_in_cells_count_.table[cell_global_index] += 1; 00487 } 00488 } 00489 00490 template<class Record> 00491 void MultiIndexer<Record>::BuildMultiIndex(const string& points_filename, 00492 const string& metainfo_filename, 00493 const int points_count, 00494 const vector<Centroids>& coarse_vocabs, 00495 const vector<Centroids>& fine_vocabs, 00496 const RerankMode& mode, 00497 const bool build_coarse_quantization, 00498 const string& files_prefix, 00499 const string& coarse_quantization_filename) { 00500 InitParameters<Record>(fine_vocabs, mode, metainfo_filename); 00501 InitBlasStructures(coarse_vocabs); 00502 files_prefix_ = files_prefix; 00503 coarse_quantization_filename_ = coarse_quantization_filename; 00504 if(build_coarse_quantization) { 00505 PrepareCoarseQuantization(points_filename, points_count, coarse_vocabs); 00506 } else { 00507 RestorePointsInCellsCountFromCourseQuantization(points_filename, 00508 points_count, 00509 coarse_vocabs); 00510 } 00511 FillMultiIndex(points_filename, points_count, coarse_vocabs, fine_vocabs, mode); 00512 cout << "Multiindex created" << endl; 00513 SerializeMultiIndexFiles(); 00514 cout << "Multiindex serialized" << endl; 00515 } 00516 00517 template<class Record> 00518 void MultiIndexer<Record>::InitBlasStructures(const vector<Centroids>& coarse_vocabs) { 00519 coarse_vocabs_matrices_.resize(coarse_vocabs.size()); 00520 coarse_centroids_norms_.resize(coarse_vocabs.size(), vector<float>(coarse_vocabs[0].size())); 00521 for(int coarse_id = 0; coarse_id < coarse_vocabs_matrices_.size(); ++coarse_id) { 00522 coarse_vocabs_matrices_[coarse_id] = new float[coarse_vocabs[0].size() * coarse_vocabs[0][0].size()]; 00523 for(int i = 0; i < coarse_vocabs[0].size(); ++i) { 00524 Coord norm = 0; 00525 for(int j = 0; j < coarse_vocabs[0][0].size(); ++j) { 00526 coarse_vocabs_matrices_[coarse_id][coarse_vocabs[0][0].size() * i + j] = coarse_vocabs[coarse_id][i][j]; 00527 norm += coarse_vocabs[coarse_id][i][j] * coarse_vocabs[coarse_id][i][j]; 00528 } 00529 coarse_centroids_norms_[coarse_id][i] = norm; 00530 } 00531 } 00532 } 00533 00534 template<> 00535 inline void GetRecord<PointId> (const Point& point, const PointId pid, 00536 const vector<ClusterId> coarse_quantization, 00537 const vector<Centroids>& coarse_vocabs, 00538 PointId* result) { 00539 *result = pid; 00540 } 00541 00542 inline void FillAdcInfo(const Point& point, const PointId pid, 00543 const vector<Centroids>& fine_vocabs, 00544 char* result) { 00545 int subvectors_count = fine_vocabs.size(); 00546 int subvector_dim = point.size() / subvectors_count; 00547 for(int subvector_index = 0; subvector_index < subvectors_count; ++subvector_index) { 00548 Dimensions start_dim = subvector_index * subvector_dim; 00549 Dimensions final_dim = start_dim + subvector_dim; 00550 *((FineClusterId*)result) = (FineClusterId)GetNearestClusterId(point, fine_vocabs[subvector_index], 00551 start_dim, final_dim); 00552 result += sizeof(FineClusterId); 00553 } 00554 } 00555 00556 template<> 00557 inline void GetRecord<RerankADC8> (const Point& point, const PointId pid, 00558 const vector<ClusterId> coarse_quantization, 00559 const vector<Centroids>& coarse_vocabs, 00560 RerankADC8* result) { 00561 result->pid = pid; 00562 char* rerank_info_ptr = (char*)result + sizeof(pid); 00563 if(gConfig.rerank_mode == USE_RESIDUALS) { 00564 Point residual; 00565 GetResidual(point, coarse_quantization, coarse_vocabs, &residual); 00566 FillAdcInfo(residual, pid, gConfig.fine_vocabs, rerank_info_ptr); 00567 } else if (gConfig.rerank_mode == USE_INIT_POINTS) { 00568 FillAdcInfo(point, pid, gConfig.fine_vocabs, rerank_info_ptr); 00569 } 00570 } 00571 00572 template<> 00573 inline void GetRecord<RerankADC16> (const Point& point, const PointId pid, 00574 const vector<ClusterId> coarse_quantization, 00575 const vector<Centroids>& coarse_vocabs, 00576 RerankADC16* result) { 00577 result->pid = pid; 00578 char* rerank_info_ptr = (char*)result + sizeof(pid); 00579 if(gConfig.rerank_mode == USE_RESIDUALS) { 00580 Point residual; 00581 GetResidual(point, coarse_quantization, coarse_vocabs, &residual); 00582 FillAdcInfo(residual, pid, gConfig.fine_vocabs, rerank_info_ptr); 00583 } else if (gConfig.rerank_mode == USE_INIT_POINTS) { 00584 FillAdcInfo(point, pid, gConfig.fine_vocabs, rerank_info_ptr); 00585 } 00586 } 00587 00588 #endif 00589 00590 00591 00592