MultiIndex
indexer.h
Go to the documentation of this file.
00001 
00003 // Copyright 2012 Yandex Artem Babenko
00004 #ifndef INDEXER_H_
00005 #define INDEXER_H_
00006 
00007 #include <ctime>
00008 #include <map>
00009 
00010 #include <boost/archive/binary_iarchive.hpp>
00011 #include <boost/archive/binary_oarchive.hpp>
00012 
00013 #include <boost/algorithm/string/split.hpp>
00014 #include <boost/algorithm/string.hpp>
00015 
00016 #include <boost/lexical_cast.hpp>
00017 
00018 #include <boost/serialization/serialization.hpp>
00019 #include <boost/serialization/set.hpp>
00020 #include <boost/serialization/vector.hpp>
00021 
00022 #include "data_util.h"
00023 #include "multitable.hpp"
00024 
00025 
00026 using std::ifstream;
00027 using std::map;
00028 using std::multimap;
00029 using std::ofstream;
00030 using std::string;
00031 
00032 using boost::lexical_cast;
00033 using boost::split;
00034 
00035 extern int THREADS_COUNT;
00036 
00037 extern Dimensions SPACE_DIMENSION;
00038 
00039 extern enum PointType point_type;
00040 
00041 IndexConfig gConfig;
00042 
00048 template<class Record>
00049 class MultiIndexer {
00050  public:
00055   MultiIndexer(const int multiplicity = 2);
00067   void BuildMultiIndex(const string& points_filename,
00068                        const string& metainfo_filename,
00069                        const int points_count,
00070                        const vector<Centroids>& coarse_vocabs,
00071                        const vector<Centroids>& fine_vocabs,
00072                        const RerankMode& mode,
00073                        const bool build_coarse_quantization,
00074                        const string& files_prefix,
00075                        const string& coarse_quantization_filename = "");
00076  private:
00083   void PrepareCoarseQuantization(const string& points_filename,
00084                                  const int points_count,
00085                                  const vector<Centroids>& coarse_vocabs);
00094   void GetCoarseQuantizationsForSubset(const string& points_filename,
00095                                        const int start_pid,
00096                                        const int subset_size,
00097                                        const vector<Centroids>& coarse_vocabs,
00098                                        vector<vector<ClusterId> >*
00099                                        transposed_coarse_quantizations);
00106   void SerializeCoarseQuantizations(const vector<vector<ClusterId> >&
00107                                     transposed_coarse_quantizations,
00108                                     const string& filename);
00113   void SerializeMultiIndexFiles();
00117   void ConvertPointsInCellsCountToCellEdges();
00118 
00127   void FillMultiIndex(const string& points_filename,
00128                       const int points_count,
00129                       const vector<Centroids>& coarse_vocabs,
00130                       const vector<Centroids>& fine_vocabs,
00131                       const RerankMode& mode);
00142   void FillMultiIndexForSubset(const string& points_filename,
00143                                const PointId start_pid,
00144                                const int points_count,
00145                                const vector<Centroids>& coarse_vocabs,
00146                                const vector<Centroids>& fine_vocabs,
00147                                const RerankMode& mode,
00148                                Multitable<int>* points_written_in_index);
00149 
00156   void GetPointCoarseQuantization(const PointId pid,
00157                                   const string& filename,
00158                                   vector<ClusterId>* coarse_quantization);
00165   void FillPointRerankInfo(const Point& point,
00166                            const PointId pid,
00167                            const vector<Centroids>& fine_vocabs);
00175   void RestorePointsInCellsCountFromCourseQuantization(const string& points_filename,
00176                                                        const int points_count,
00177                                                        const vector<Centroids>& coarse_vocabs);
00181   int GetInputCoordSizeof();
00187   void ReadPoint(ifstream& input, Point* point);
00192   void InitBlasStructures(const vector<Centroids>& coarse_vocabs);      
00196   string files_prefix_;
00200   string coarse_quantization_filename_;
00204   int multiplicity_;
00208   Multitable<int> point_in_cells_count_;
00212   MultiIndex<Record> multiindex_;
00216   boost::mutex cell_counts_mutex_;
00220   vector<float*> coarse_vocabs_matrices_;
00224   vector<vector<float> > coarse_centroids_norms_;
00225 };
00226 
00227 template<class Record>
00228 inline void GetRecord(const Point& point, const PointId pid,
00229                       const vector<ClusterId> coarse_quantization,
00230                       const vector<Centroids>& coarse_vocabs,
00231                       Record* result) {
00232 }
00233 
00234 template<class Record>
00235 void InitParameters(const vector<Centroids>& fine_vocabs,
00236                     const RerankMode& mode,
00237                     const string& metainfo_filename) {
00238   gConfig.fine_vocabs = fine_vocabs;
00239   gConfig.rerank_mode = mode;
00240 }
00241 
00242 
00244 template<class Record>
00245 MultiIndexer<Record>::MultiIndexer(const int multiplicity) {
00246   if(multiplicity < 0) {
00247     throw std::logic_error("Multiplicity < 0");
00248   }
00249   multiplicity_ = multiplicity;
00250 }
00251 
00252 template<class Record>
00253 int MultiIndexer<Record>::GetInputCoordSizeof() {
00254   if(point_type == FVEC) {
00255     return (int)sizeof(float);
00256   } else if(point_type == BVEC) {
00257     return (int)sizeof(unsigned char);
00258   }
00259 }
00260 
00261 template<class Record>
00262 void MultiIndexer<Record>::ReadPoint(ifstream& input, Point* point) {
00263   if(!input.good()) {
00264     throw std::logic_error("Bad input stream");
00265   }
00266   if(point_type == FVEC) {
00267     ReadVector<float, Coord>(input, point);
00268   } else if(point_type == BVEC) {
00269     ReadVector<unsigned char, Coord>(input, point);
00270   }    
00271 }
00272 
00273 template<class Record>
00274 void MultiIndexer<Record>::SerializeCoarseQuantizations(const vector<vector<ClusterId> >&
00275                                                                           transposed_coarse_quantizations,
00276                                                         const string& filename) {
00277   ofstream quantizations_stream;
00278   quantizations_stream.open(filename.c_str(), ios::binary);
00279   if(!quantizations_stream.good()) {
00280     throw std::logic_error("Bad input stream");
00281   }
00282   cout << "Writing coarse quantizations started" << endl;
00283   for(PointId pid = 0; pid < transposed_coarse_quantizations[0].size(); ++pid) {
00284     for(int subspace_index = 0; subspace_index < multiplicity_; ++subspace_index) {
00285       ClusterId quantization = transposed_coarse_quantizations[subspace_index][pid];
00286       quantizations_stream.write((char*)&quantization, sizeof(quantization));
00287     }
00288   }
00289   quantizations_stream.close();
00290   cout << "Writing coarse quantizations started" << endl;
00291 }
00292 
00293 template<class Record>
00294 void MultiIndexer<Record>::SerializeMultiIndexFiles() {
00295   cout << "Start multiindex serializing....\n";
00296   ofstream cell_edges(string(files_prefix_ + "_cell_edges.bin").c_str(), ios::binary);
00297   boost::archive::binary_oarchive arc_cell_edges(cell_edges);
00298   arc_cell_edges << multiindex_.cell_edges;
00299   ofstream multi_array(string(files_prefix_ + "_multi_array.bin").c_str(), ios::binary);
00300   boost::archive::binary_oarchive arc_multi_array(multi_array);
00301   arc_multi_array << multiindex_.multiindex;
00302   cout << "Finish multiindex serializing....\n";
00303 }
00304 
00305 template<class Record>
00306 void MultiIndexer<Record>::GetCoarseQuantizationsForSubset(const string& points_filename,
00307                                                            const int start_pid,
00308                                                            const int subset_size,
00309                                                            const vector<Centroids>& coarse_vocabs,
00310                                                            vector<vector<ClusterId> >*
00311                                                                   transposed_coarse_quantizations) {
00312   ifstream point_stream;
00313   point_stream.open(points_filename.c_str(), ios::binary);
00314   if(!point_stream.good()) {
00315     throw std::logic_error("Bad input points stream");
00316   }
00317   // we assume points are stored in .fvecs or .bvecs format
00318   point_stream.seekg(start_pid * (GetInputCoordSizeof() * SPACE_DIMENSION + sizeof(Dimensions)), ios::beg);
00319   vector<ClusterId> coarse_quantization(multiplicity_);
00320   for(int point_number = 0; point_number < subset_size; ++point_number) {
00321     if(point_number % 10000 == 0) {
00322       cout << "Getting coarse quantization, point # " << start_pid + point_number << endl;
00323     }
00324     Point current_point;
00325     ReadPoint(point_stream, &current_point);
00326     int subpoints_dimension = SPACE_DIMENSION / multiplicity_;
00327     for(int coarse_index = 0; coarse_index < multiplicity_; ++coarse_index) {
00328       Dimensions start_dim = coarse_index * subpoints_dimension;
00329       Dimensions final_dim = start_dim + subpoints_dimension;
00330       ClusterId nearest = GetNearestClusterId(current_point, coarse_vocabs.at(coarse_index),
00331                                               start_dim, final_dim);
00332       transposed_coarse_quantizations->at(coarse_index)[start_pid + point_number] = nearest;
00333       coarse_quantization[coarse_index] = nearest;
00334     }
00335     int global_index = point_in_cells_count_.GetCellGlobalIndex(coarse_quantization);
00336     cell_counts_mutex_.lock();
00337     ++(point_in_cells_count_.table[global_index]);
00338     cell_counts_mutex_.unlock();
00339   }
00340 }
00341 
00342 template<class Record>
00343 void MultiIndexer<Record>::PrepareCoarseQuantization(const string& points_filename,
00344                                                      const int points_count,
00345                                                      const vector<Centroids>& coarse_vocabs) {
00346   // we use transposed quantizations for efficient memory usage
00347   vector<vector<ClusterId> > transposed_coarse_quantizations; 
00348   transposed_coarse_quantizations.resize(multiplicity_);
00349   vector<int> multiindex_table_dimensions;
00350   for(int i = 0; i < multiplicity_; ++i) {
00351     transposed_coarse_quantizations[i].resize(points_count);
00352     multiindex_table_dimensions.push_back(coarse_vocabs[i].size());
00353   }
00354   point_in_cells_count_.Resize(multiindex_table_dimensions);
00355   cout << "Memory for coarse quantizations allocated" << endl;
00356   boost::thread_group index_threads;
00357   int thread_points_count = points_count / THREADS_COUNT;
00358   for(int thread_id = 0; thread_id < THREADS_COUNT; ++thread_id) {
00359     PointId start_pid = thread_points_count * thread_id;
00360     index_threads.create_thread(boost::bind(&MultiIndexer::GetCoarseQuantizationsForSubset,
00361                                             this, points_filename, start_pid, thread_points_count,
00362                                             boost::cref(coarse_vocabs), &transposed_coarse_quantizations));
00363   }
00364   index_threads.join_all();
00365   if(coarse_quantization_filename_.empty()) {
00366     coarse_quantization_filename_ = files_prefix_ + "_coarse_quantizations.bin";
00367   }
00368   cout << "Coarse quantizations are calculated" << endl;
00369   SerializeCoarseQuantizations(transposed_coarse_quantizations, coarse_quantization_filename_);
00370   cout << "Coarse quantizations are serialized" << endl;
00371 }
00372 
00373 template<class Record>
00374 void MultiIndexer<Record>::ConvertPointsInCellsCountToCellEdges() {
00375   cout << "Converting points in cells to cell edges...\n";
00376   multiindex_.cell_edges = point_in_cells_count_;
00377   multiindex_.cell_edges.table[0] = 0;
00378   for(int global_index = 1;
00379       global_index < point_in_cells_count_.table.size();
00380       ++global_index) {
00381     multiindex_.cell_edges.table[global_index] = multiindex_.cell_edges.table[global_index - 1] +
00382                                                  point_in_cells_count_.table[global_index - 1];
00383   }
00384   // we do not need this table more
00385   point_in_cells_count_.table.clear();
00386   cout << "Finish converting points in cells to cell edges...\n";
00387 }
00388 
00389 template<class Record>
00390 void MultiIndexer<Record>::GetPointCoarseQuantization(const PointId pid,
00391                                                       const string& filename,
00392                                                       vector<ClusterId>* coarse_quantization) {
00393   ifstream coarse_quantization_stream;
00394   coarse_quantization_stream.open(filename.c_str(), ios::binary);
00395   if(!coarse_quantization_stream.good()) {
00396     throw std::logic_error("Bad input coarse quantizations stream");
00397   }
00398   coarse_quantization_stream.seekg((long long)pid * sizeof(ClusterId) * multiplicity_, ios::beg);
00399   for(int coarse_index = 0; coarse_index < multiplicity_; ++coarse_index) {
00400     coarse_quantization_stream.read((char*)&(coarse_quantization->at(coarse_index)),
00401                                     sizeof(coarse_quantization->at(coarse_index)));
00402   }
00403 }
00404 
00405 template<class Record>
00406 void MultiIndexer<Record>::FillMultiIndexForSubset(const string& points_filename,
00407                                                    const PointId start_pid,
00408                                                    const int points_count,
00409                                                    const vector<Centroids>& coarse_vocabs,
00410                                                    const vector<Centroids>& fine_vocabs,
00411                                                    const RerankMode& mode,
00412                                                    Multitable<int>* points_written_in_index) {
00413   ifstream point_stream;
00414   point_stream.open(points_filename.c_str(), ios::binary);
00415   if(!point_stream.good()) {
00416     throw std::logic_error("Bad input points stream");
00417   }
00418   point_stream.seekg((long long)start_pid * (GetInputCoordSizeof() * SPACE_DIMENSION + sizeof(Dimensions)), ios::beg);
00419   for(int point_number = 0; point_number < points_count; ++point_number) {
00420     if(point_number % 10000 == 0) {
00421       cout << "Filling multiindex, point # " << start_pid + point_number << endl;
00422     }
00423   Point current_point;
00424   ReadPoint(point_stream, &current_point);
00425   vector<ClusterId> coarse_quantization(multiplicity_);
00426   GetPointCoarseQuantization(start_pid + point_number,
00427                              coarse_quantization_filename_,
00428                              &coarse_quantization);
00429   int current_written_count = points_written_in_index->GetValue(coarse_quantization);
00430   int pid_multiindex = multiindex_.cell_edges.GetValue(coarse_quantization) + current_written_count;
00431   GetRecord<Record>(current_point, start_pid + point_number,
00432                     coarse_quantization, coarse_vocabs, &(multiindex_.multiindex[pid_multiindex]));
00433   cell_counts_mutex_.lock();
00434   points_written_in_index->SetValue(current_written_count + 1, coarse_quantization);
00435   cell_counts_mutex_.unlock();
00436   }
00437 }
00438 
00439 template<class Record>
00440 void MultiIndexer<Record>::FillMultiIndex(const string& points_filename,
00441                                           const int points_count,
00442                                           const vector<Centroids>& coarse_vocabs,
00443                                           const vector<Centroids>& fine_vocabs,
00444                                           const RerankMode& mode) {
00445   ConvertPointsInCellsCountToCellEdges();
00446   multiindex_.multiindex.resize(points_count);
00447   cout << "Indexing started..." << endl;
00448 
00449   Multitable<int> points_written_in_index(multiindex_.cell_edges.dimensions);
00450   int thread_points_count = points_count / THREADS_COUNT;
00451   boost::thread_group threads;
00452   for(int thread_id = 0; thread_id < THREADS_COUNT; ++thread_id) {
00453     PointId start_pid = thread_points_count * thread_id;
00454     threads.create_thread(boost::bind(&MultiIndexer::FillMultiIndexForSubset, this, points_filename, start_pid,
00455                                       thread_points_count, boost::cref(coarse_vocabs),
00456                                       boost::cref(fine_vocabs), mode, &points_written_in_index));
00457   }
00458   threads.join_all();
00459   cout << "Indexing finished..." << endl;
00460 }
00461 
00462 template<class Record>
00463 void MultiIndexer<Record>::RestorePointsInCellsCountFromCourseQuantization(const string& points_filename,
00464                                                                            const int points_count,
00465                                                                            const vector<Centroids>& coarse_vocabs) {
00466   vector<int> dimensions;
00467   for(int i = 0; i < multiplicity_; ++i) {
00468     dimensions.push_back(coarse_vocabs[i].size());
00469   }
00470   point_in_cells_count_.Resize(dimensions);
00471   ifstream coarse_quantization_stream;
00472   coarse_quantization_stream.open(coarse_quantization_filename_.c_str(), ios::binary);
00473   if(!coarse_quantization_stream.good()) {
00474     throw std::logic_error("Bad input coarse quantizations stream");
00475   }
00476   CoarseQuantization quantization(multiplicity_);
00477   for(PointId pid = 0; pid < points_count; ++pid) {
00478     if(pid % 100000 == 0) {
00479       cout << pid << endl;
00480     }
00481     for(int subspace_index = 0; subspace_index < multiplicity_; ++subspace_index) {
00482       coarse_quantization_stream.read((char*)&(quantization[subspace_index]), 
00483                                       sizeof(ClusterId));
00484     }
00485     int cell_global_index = point_in_cells_count_.GetCellGlobalIndex(quantization);
00486     point_in_cells_count_.table[cell_global_index] += 1;
00487   }
00488 }
00489 
00490 template<class Record>
00491 void MultiIndexer<Record>::BuildMultiIndex(const string& points_filename,
00492                                            const string& metainfo_filename,
00493                                            const int points_count,
00494                                            const vector<Centroids>& coarse_vocabs,
00495                                            const vector<Centroids>& fine_vocabs,
00496                                            const RerankMode& mode,
00497                                            const bool build_coarse_quantization,
00498                                            const string& files_prefix,
00499                                            const string& coarse_quantization_filename) {
00500   InitParameters<Record>(fine_vocabs, mode, metainfo_filename);
00501   InitBlasStructures(coarse_vocabs);
00502   files_prefix_ = files_prefix;
00503   coarse_quantization_filename_ = coarse_quantization_filename;
00504   if(build_coarse_quantization) {
00505     PrepareCoarseQuantization(points_filename, points_count, coarse_vocabs);
00506   } else {
00507   RestorePointsInCellsCountFromCourseQuantization(points_filename,
00508                                                   points_count,
00509                                                   coarse_vocabs);
00510   }
00511   FillMultiIndex(points_filename, points_count, coarse_vocabs, fine_vocabs, mode);
00512   cout << "Multiindex created" << endl;
00513   SerializeMultiIndexFiles();
00514   cout << "Multiindex serialized" << endl;
00515 }
00516 
00517 template<class Record>
00518 void MultiIndexer<Record>::InitBlasStructures(const vector<Centroids>& coarse_vocabs) {
00519   coarse_vocabs_matrices_.resize(coarse_vocabs.size());
00520   coarse_centroids_norms_.resize(coarse_vocabs.size(), vector<float>(coarse_vocabs[0].size()));
00521   for(int coarse_id = 0; coarse_id < coarse_vocabs_matrices_.size(); ++coarse_id) {
00522     coarse_vocabs_matrices_[coarse_id] = new float[coarse_vocabs[0].size() * coarse_vocabs[0][0].size()];
00523     for(int i = 0; i < coarse_vocabs[0].size(); ++i) {
00524       Coord norm = 0;
00525       for(int j = 0; j < coarse_vocabs[0][0].size(); ++j) {
00526         coarse_vocabs_matrices_[coarse_id][coarse_vocabs[0][0].size() * i + j] = coarse_vocabs[coarse_id][i][j];
00527         norm += coarse_vocabs[coarse_id][i][j] * coarse_vocabs[coarse_id][i][j];
00528       }
00529       coarse_centroids_norms_[coarse_id][i] = norm;
00530     }
00531   }
00532 }
00533 
00534 template<>
00535 inline void GetRecord<PointId> (const Point& point, const PointId pid,
00536                                 const vector<ClusterId> coarse_quantization,
00537                                 const vector<Centroids>& coarse_vocabs,
00538                                 PointId* result) {
00539   *result = pid;
00540 }
00541 
00542 inline void FillAdcInfo(const Point& point, const PointId pid,
00543                         const vector<Centroids>& fine_vocabs,
00544                         char* result) {
00545   int subvectors_count = fine_vocabs.size();
00546   int subvector_dim = point.size() / subvectors_count;
00547   for(int subvector_index = 0; subvector_index < subvectors_count; ++subvector_index) {
00548     Dimensions start_dim = subvector_index * subvector_dim;
00549     Dimensions final_dim = start_dim + subvector_dim;
00550     *((FineClusterId*)result) = (FineClusterId)GetNearestClusterId(point, fine_vocabs[subvector_index],
00551                                                                                start_dim, final_dim);
00552     result += sizeof(FineClusterId);
00553   }
00554 }
00555 
00556 template<>
00557 inline void GetRecord<RerankADC8> (const Point& point, const PointId pid,
00558                                    const vector<ClusterId> coarse_quantization,
00559                                    const vector<Centroids>& coarse_vocabs,
00560                                    RerankADC8* result) {
00561   result->pid = pid;
00562   char* rerank_info_ptr = (char*)result + sizeof(pid);
00563   if(gConfig.rerank_mode == USE_RESIDUALS) {
00564     Point residual;
00565     GetResidual(point, coarse_quantization, coarse_vocabs, &residual);
00566     FillAdcInfo(residual, pid, gConfig.fine_vocabs, rerank_info_ptr);
00567   } else if (gConfig.rerank_mode == USE_INIT_POINTS) {
00568     FillAdcInfo(point, pid, gConfig.fine_vocabs, rerank_info_ptr);
00569   }
00570 }
00571 
00572 template<>
00573 inline void GetRecord<RerankADC16> (const Point& point, const PointId pid,
00574                                     const vector<ClusterId> coarse_quantization,
00575                                     const vector<Centroids>& coarse_vocabs,
00576                                     RerankADC16* result) {
00577   result->pid = pid;
00578   char* rerank_info_ptr = (char*)result + sizeof(pid);
00579   if(gConfig.rerank_mode == USE_RESIDUALS) {
00580     Point residual;
00581     GetResidual(point, coarse_quantization, coarse_vocabs, &residual);
00582     FillAdcInfo(residual, pid, gConfig.fine_vocabs, rerank_info_ptr);
00583   } else if (gConfig.rerank_mode == USE_INIT_POINTS) {
00584     FillAdcInfo(point, pid, gConfig.fine_vocabs, rerank_info_ptr);
00585   }
00586 }
00587 
00588 #endif
00589 
00590 
00591 
00592 
 All Classes Files Functions Variables Typedefs Enumerations Enumerator