mlpack  master
dataset_mapper.hpp
Go to the documentation of this file.
1 
15 #ifndef MLPACK_CORE_DATA_DATASET_INFO_HPP
16 #define MLPACK_CORE_DATA_DATASET_INFO_HPP
17 
18 #include <mlpack/prereqs.hpp>
19 #include <unordered_map>
20 #include <boost/bimap.hpp>
21 
23 
24 namespace mlpack {
25 namespace data {
35 template <typename PolicyType>
37 {
38  public:
44  explicit DatasetMapper(const size_t dimensionality = 0);
45 
51  explicit DatasetMapper(PolicyType& policy, const size_t dimensionality = 0);
52 
62  typename PolicyType::MappedType MapString(const std::string& string,
63  const size_t dimension);
64 
73  const std::string& UnmapString(const size_t value, const size_t dimension);
74 
75 
84  typename PolicyType::MappedType UnmapValue(const std::string& string,
85  const size_t dimension);
86 
99  template <typename eT>
100  void MapTokens(const std::vector<std::string>& tokens, size_t& row,
101  arma::Mat<eT>& matrix);
102 
104  Datatype Type(const size_t dimension) const;
106  Datatype& Type(const size_t dimension);
107 
112  size_t NumMappings(const size_t dimension) const;
113 
120  size_t Dimensionality() const;
121 
125  template<typename Archive>
126  void Serialize(Archive& ar, const unsigned int /* version */)
127  {
128  ar & data::CreateNVP(types, "types");
129  ar & data::CreateNVP(maps, "maps");
130  }
131 
133  const PolicyType& Policy() const;
134 
136  PolicyType& Policy();
137 
139  void Policy(PolicyType&& policy);
140 
141  private:
143  std::vector<Datatype> types;
144 
145  // BiMapType definition
146  using BiMapType = boost::bimap<std::string, typename PolicyType::MappedType>;
147 
148  // Mappings from strings to integers.
149  // Map entries will only exist for dimensions that are categorical.
150  // MapType = map<dimension, pair<bimap<string, MappedType>, numMappings>>
151  using MapType = std::unordered_map<size_t, std::pair<BiMapType, size_t>>;
152 
155 
157  // mapped to the maps object. It is used in MapString() and MapTokens().
158  PolicyType policy;
159 };
160 
161 // Use typedef to provide backward compatibility
163 
164 } // namespace data
165 } // namespace mlpack
166 
167 #include "dataset_mapper_impl.hpp"
168 
169 #endif
Auxiliary information for a dataset, including mappings to/from strings and the datatype of each dimension.
PolicyType policy
The policy object tells the dataset mapper how the categorical values should be mapped to the maps object. It is used in MapString() and MapTokens().
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: binarize.hpp:18
Datatype
The Datatype enum specifies the types of data mlpack algorithms can use.
Definition: datatype.hpp:24
Datatype Type(const size_t dimension) const
Return the type of a given dimension (numeric or categorical).
std::unordered_map< size_t, std::pair< BiMapType, size_t >> MapType
The core includes that mlpack expects; standard C++ includes and Armadillo.
FirstShim< T > CreateNVP(T &t, const std::string &name, typename std::enable_if_t< HasSerialize< T >::value > *=0)
Call this function to produce a name-value pair; this is similar to BOOST_SERIALIZATION_NVP(), but should be used for types that have a Serialize() function (or contain a type that has a Serialize() function) instead of a serialize() function.
size_t Dimensionality() const
Get the dimensionality of the DatasetMapper object (that is, how many dimensions it has information for).
DatasetMapper(const size_t dimensionality=0)
Create the DatasetMapper object with the given dimensionality.
MapType maps
maps object stores string and numerical pairs.
std::vector< Datatype > types
Types of each dimension.
PolicyType::MappedType MapString(const std::string &string, const size_t dimension)
Given the string and the dimension to which it belongs, return its numeric mapping.
void MapTokens(const std::vector< std::string > &tokens, size_t &row, arma::Mat< eT > &matrix)
MapTokens turns vector of strings into numeric variables and puts them into a given matrix...
size_t NumMappings(const size_t dimension) const
Get the number of mappings for a particular dimension.
const PolicyType & Policy() const
Return the policy of the mapper.
boost::bimap< std::string, typename PolicyType::MappedType > BiMapType
const std::string & UnmapString(const size_t value, const size_t dimension)
Return the string that corresponds to a given value in a given dimension.
test cpp RESULT_VARIABLE MEX_RESULT_TRASH OUTPUT_VARIABLE MEX_OUTPUT ERROR_VARIABLE MEX_ERROR_TRASH string(REGEX MATCH"Warning: You are using"MEX_WARNING"${MEX_OUTPUT}") if(MEX_WARNING) string(REGEX REPLACE".*using [a-zA-Z]* version \"([0-9.]*)[^\"]*\".*""\\1"OTHER_COMPILER_VERSION"$
Definition: CMakeLists.txt:18
void Serialize(Archive &ar, const unsigned int)
Serialize the dataset information.
PolicyType::MappedType UnmapValue(const std::string &string, const size_t dimension)
Return the value that corresponds to a given string in a given dimension.