Alexandria  2.18
Please provide a description of the project.
NpyCommon.h
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2012-2021 Euclid Science Ground Segment
3  *
4  * This library is free software; you can redistribute it and/or modify it under
5  * the terms of the GNU Lesser General Public License as published by the Free
6  * Software Foundation; either version 3.0 of the License, or (at your option)
7  * any later version.
8  *
9  * This library is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU Lesser General Public License
15  * along with this library; if not, write to the Free Software Foundation, Inc.,
16  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 
19 #ifndef ALEXANDRIA_NDARRAY_IMPL_NPYCOMMON_H
20 #define ALEXANDRIA_NDARRAY_IMPL_NPYCOMMON_H
21 
23 #include <boost/endian/arithmetic.hpp>
24 #include <boost/filesystem/operations.hpp>
25 #include <boost/iostreams/device/mapped_file.hpp>
26 #include <boost/regex.hpp>
27 
28 namespace Euclid {
29 namespace NdArray {
30 
31 using boost::endian::little_uint16_t;
32 using boost::endian::little_uint32_t;
33 
37 constexpr const char NPY_MAGIC[] = {'\x93', 'N', 'U', 'M', 'P', 'Y'};
38 
42 #if BYTE_ORDER == LITTLE_ENDIAN
43 constexpr const char* ENDIAN_MARKER = "<";
44 #elif BYTE_ORDER == BIG_ENDIAN
45 constexpr const char* ENDIAN_MARKER = ">";
46 #else
47 #error "PDP_ENDIAN not supported"
48 #endif
49 
53 template <typename T>
54 struct NpyDtype {};
55 
56 template <>
57 struct NpyDtype<int8_t> {
58  static constexpr const char* str = "b";
59 };
60 
61 template <>
62 struct NpyDtype<int16_t> {
63  static constexpr const char* str = "i2";
64 };
65 
66 template <>
67 struct NpyDtype<int32_t> {
68  static constexpr const char* str = "i4";
69 };
70 
71 template <>
72 struct NpyDtype<int64_t> {
73  static constexpr const char* str = "i8";
74 };
75 
76 template <>
77 struct NpyDtype<uint8_t> {
78  static constexpr const char* str = "B";
79 };
80 
81 template <>
82 struct NpyDtype<uint16_t> {
83  static constexpr const char* str = "u2";
84 };
85 
86 template <>
87 struct NpyDtype<uint32_t> {
88  static constexpr const char* str = "u4";
89 };
90 
91 template <>
92 struct NpyDtype<uint64_t> {
93  static constexpr const char* str = "u8";
94 };
95 
96 template <>
97 struct NpyDtype<float> {
98  static constexpr const char* str = "f4";
99 };
100 
101 template <>
102 struct NpyDtype<double> {
103  static constexpr const char* str = "f8";
104 };
105 
109 inline void parseSingleValue(const std::string& descr, bool& big_endian, std::string& dtype) {
110  big_endian = (descr.front() == '>');
111  dtype = descr.substr(1);
112 }
113 
122 inline void parseFieldValues(const std::string& descr, bool& big_endian, std::vector<std::string>& attrs, std::string& dtype) {
123  static const boost::regex field_expr("\\('([^']*)',\\s*'([^']*)'\\)");
124 
125  boost::match_results<std::string::const_iterator> match;
126  auto start = descr.begin();
127  auto end = descr.end();
128 
129  while (boost::regex_search(start, end, match, field_expr)) {
130  bool endian_aux;
131  std::string dtype_aux;
132 
133  parseSingleValue(match[2].str(), endian_aux, dtype_aux);
134  if (dtype.empty()) {
135  dtype = dtype_aux;
136  big_endian = endian_aux;
137  } else if (dtype != dtype_aux || big_endian != endian_aux) {
138  throw std::invalid_argument("NdArray only supports uniform types");
139  }
140  attrs.emplace_back(match[1].str());
141 
142  start = match[0].second;
143  }
144 }
145 
163 inline void parseNpyDict(const std::string& header, bool& fortran_order, bool& big_endian, std::string& dtype,
164  std::vector<size_t>& shape, std::vector<std::string>& attrs, size_t& n_elements) {
165  auto loc = header.find("fortran_order") + 16;
166  fortran_order = (header.substr(loc, 4) == "True");
167 
168  loc = header.find("descr") + 8;
169 
170  if (header[loc] == '\'') {
171  auto end = header.find('\'', loc + 1);
172  parseSingleValue(header.substr(loc + 1, end - loc - 1), big_endian, dtype);
173  } else if (header[loc] == '[') {
174  auto end = header.find(']', loc + 1);
175  parseFieldValues(header.substr(loc + 1, end - loc - 1), big_endian, attrs, dtype);
176  } else {
177  throw Elements::Exception() << "Failed to parse the array description: " << header;
178  }
179 
180  loc = header.find("shape") + 9;
181  auto loc2 = header.find(')', loc);
182  auto shape_str = header.substr(loc, loc2 - loc);
183  if (shape_str.back() == ',')
184  shape_str.resize(shape_str.size() - 1);
185  shape = stringToVector<size_t>(shape_str);
186  n_elements = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<size_t>());
187 }
188 
204  size_t& n_elements) {
205  // Magic
206  char magic[6];
207  input.read(magic, sizeof(magic));
208  if (std::memcmp(magic, NPY_MAGIC, sizeof(NPY_MAGIC)) != 0) {
209  throw Elements::Exception() << "Unexpected magic sequence";
210  }
211 
212  // Version and header len
213  little_uint32_t header_len;
214  little_uint16_t version;
215  input.read(reinterpret_cast<char*>(&version), sizeof(version));
216  if (version > 30) {
217  throw Elements::Exception() << "Only numpy arrays with version 3 or less are supported";
218  } else if (version.data()[0] == 1) {
219  // 16 bits integer in little endian
220  little_uint16_t aux;
221  input.read(reinterpret_cast<char*>(&aux), sizeof(aux));
222  header_len = aux;
223  } else {
224  // 32 bits integer in little endian
225  input.read(reinterpret_cast<char*>(&header_len), sizeof(header_len));
226  }
227 
228  // Read header
229  std::string header(header_len, '\0');
230  input.read(&header[0], header_len);
231 
232  // Parse header
233  bool fortran_order, big_endian;
234  parseNpyDict(header, fortran_order, big_endian, dtype, shape, attrs, n_elements);
235 
236  if (fortran_order)
237  throw Elements::Exception() << "Fortran order not supported";
238 
239  if ((big_endian && (BYTE_ORDER != BIG_ENDIAN)) || (!big_endian && (BYTE_ORDER != LITTLE_ENDIAN)))
240  throw Elements::Exception() << "Only native endianness supported for reading";
241 }
242 
246 constexpr const uint8_t NPY_VERSION[] = {'\x02', '\x00'};
247 
254  std::stringstream shape_stream;
255  shape_stream << "(";
256  for (auto s : shape) {
257  shape_stream << s << ',';
258  }
259  shape_stream << ")";
260  return shape_stream.str();
261 }
262 
264  std::stringstream dtype;
265  if (attrs.empty()) {
266  dtype << '\'' << ENDIAN_MARKER << type << '\'';
267  } else {
268  dtype << '[';
269  for (auto& attr : attrs) {
270  dtype << "('" << attr << "', '" << ENDIAN_MARKER << type << "'), ";
271  }
272  dtype << ']';
273  }
274  return dtype.str();
275 }
276 
280 template <typename T>
282  if (!attrs.empty()) {
283  if (attrs.size() != shape.back()) {
284  throw std::out_of_range("Last axis does not match number of attribute names");
285  }
286  shape.pop_back();
287  }
288  // Serialize header as a Python dict
289  std::stringstream header;
290  header << "{"
291  << "'descr': " << typeDescription(NpyDtype<T>::str, attrs) << ", 'fortran_order': False, 'shape': " << npyShape(shape)
292  << "}";
293  auto header_str = header.str();
294  little_uint32_t header_len = header_str.size();
295 
296  // Pad header with spaces so the header block is 64 bytes aligned
297  size_t total_length = sizeof(NPY_MAGIC) + sizeof(NPY_VERSION) + sizeof(header_len) + header_len + 1; // Keep 1 for \n
298  size_t padding = 64 - total_length % 64;
299  if (padding) {
300  header << std::string(padding, '\x20');
301  }
302  header << '\n';
303  header_str = header.str();
304  header_len = header_str.size();
305 
306  // Magic and version
307  out.write(NPY_MAGIC, sizeof(NPY_MAGIC));
308  out.write(reinterpret_cast<const char*>(&NPY_VERSION), sizeof(NPY_VERSION));
309 
310  // HEADER_LEN
311  out.write(reinterpret_cast<char*>(&header_len), sizeof(header_len));
312 
313  // HEADER
314  out.write(header_str.data(), header_str.size());
315 }
316 
323 template <typename T>
325 public:
326  MappedContainer(const boost::filesystem::path& path, size_t data_offset, size_t n_elements,
327  const std::vector<std::string>& attr_names, boost::iostreams::mapped_file&& input, size_t max_size)
328  : m_path(path)
329  , m_data_offset(data_offset)
330  , m_n_elements(n_elements)
331  , m_max_size(max_size)
332  , m_attr_names(attr_names)
333  , m_mapped(std::move(input))
334  , m_data(reinterpret_cast<T*>(m_mapped.data() + data_offset)) {}
335 
336  size_t size() const {
337  return m_n_elements;
338  }
339 
340  T* data() {
341  return m_data;
342  }
343 
344  void resize(const std::vector<size_t>& shape) {
345  // Generate header
346  std::stringstream header;
347  writeNpyHeader<T>(header, shape, m_attr_names);
348  auto header_str = header.str();
349  auto header_size = header_str.size();
350  // Make sure we are in place
351  if (header_size != m_data_offset) {
352  throw Elements::Exception() << "Can not resize memory mapped NPY file. "
353  "The new header length must match the allocated space.";
354  }
355 
356  m_n_elements = std::accumulate(shape.begin(), shape.end(), 1u, std::multiplies<size_t>());
357  size_t new_size = header_size + sizeof(T) * m_n_elements;
358  if (new_size > m_max_size) {
359  throw Elements::Exception() << "resize request bigger than maximum allocated size: " << new_size << " > " << m_max_size;
360  }
361  boost::filesystem::resize_file(m_path, new_size);
362  std::copy(header_str.begin(), header_str.end(), m_mapped.data());
363  }
364 
365 private:
366  boost::filesystem::path m_path;
369  boost::iostreams::mapped_file m_mapped;
370  T* m_data;
371 };
372 
373 } // end of namespace NdArray
374 } // end of namespace Euclid
375 
376 #endif // ALEXANDRIA_NDARRAY_IMPL_NPYCOMMON_H
std::string
STL class.
Euclid::NdArray::NPY_MAGIC
constexpr const char NPY_MAGIC[]
Definition: NpyCommon.h:37
Euclid::NdArray::NPY_VERSION
constexpr const uint8_t NPY_VERSION[]
Definition: NpyCommon.h:246
std::vector< std::string >
std::string::find
T find(T... args)
std::vector::size
T size(T... args)
Euclid::NdArray::typeDescription
std::string typeDescription(const std::string &type, const std::vector< std::string > &attrs)
Definition: NpyCommon.h:263
Euclid::NdArray::MappedContainer::m_path
boost::filesystem::path m_path
Definition: NpyCommon.h:366
Euclid::NdArray::NpyDtype
Definition: NpyCommon.h:54
std::stringstream
STL class.
Euclid::NdArray::writeNpyHeader
void writeNpyHeader(std::ostream &out, std::vector< size_t > shape, const std::vector< std::string > &attrs)
Definition: NpyCommon.h:281
Euclid::NdArray::MappedContainer::m_max_size
size_t m_max_size
Definition: NpyCommon.h:367
std::vector::back
T back(T... args)
Euclid::NdArray::MappedContainer::size
size_t size() const
Definition: NpyCommon.h:336
std::istream::read
T read(T... args)
Euclid::NdArray::MappedContainer::m_mapped
boost::iostreams::mapped_file m_mapped
Definition: NpyCommon.h:369
std::string::front
T front(T... args)
std::ostream::write
T write(T... args)
std::multiplies
Euclid::NdArray::MappedContainer::m_n_elements
size_t m_n_elements
Definition: NpyCommon.h:367
Euclid::NdArray::parseNpyDict
void parseNpyDict(const std::string &header, bool &fortran_order, bool &big_endian, std::string &dtype, std::vector< size_t > &shape, std::vector< std::string > &attrs, size_t &n_elements)
Definition: NpyCommon.h:163
Euclid::NdArray::parseFieldValues
void parseFieldValues(const std::string &descr, bool &big_endian, std::vector< std::string > &attrs, std::string &dtype)
Definition: NpyCommon.h:122
std::ostream
STL class.
Euclid::NdArray::ENDIAN_MARKER
constexpr const char * ENDIAN_MARKER
Definition: NpyCommon.h:43
Euclid::NdArray::MappedContainer
Definition: NpyCommon.h:324
Euclid::NdArray::MappedContainer::MappedContainer
MappedContainer(const boost::filesystem::path &path, size_t data_offset, size_t n_elements, const std::vector< std::string > &attr_names, boost::iostreams::mapped_file &&input, size_t max_size)
Definition: NpyCommon.h:326
std::copy
T copy(T... args)
Elements::Exception
std::accumulate
T accumulate(T... args)
std::invalid_argument
STL class.
std::vector::pop_back
T pop_back(T... args)
Euclid::NdArray::MappedContainer::data
T * data()
Definition: NpyCommon.h:340
StringUtils.h
Euclid::NdArray::MappedContainer::resize
void resize(const std::vector< size_t > &shape)
Definition: NpyCommon.h:344
std::string::substr
T substr(T... args)
Euclid::NdArray::MappedContainer::m_data_offset
size_t m_data_offset
Definition: NpyCommon.h:367
std::vector::emplace_back
T emplace_back(T... args)
Euclid::NdArray::npyShape
std::string npyShape(std::vector< size_t > shape)
Definition: NpyCommon.h:253
std::string::begin
T begin(T... args)
std
STL namespace.
s
constexpr double s
std::string::empty
T empty(T... args)
std::out_of_range
STL class.
std::stringstream::str
T str(T... args)
Euclid::NdArray::MappedContainer::m_data
T * m_data
Definition: NpyCommon.h:370
std::string::end
T end(T... args)
std::memcmp
T memcmp(T... args)
path
Elements::Path::Item path
Euclid::NdArray::parseSingleValue
void parseSingleValue(const std::string &descr, bool &big_endian, std::string &dtype)
Definition: NpyCommon.h:109
std::istream
STL class.
Euclid::NdArray::MappedContainer::m_attr_names
std::vector< std::string > m_attr_names
Definition: NpyCommon.h:368
Euclid
Definition: InstOrRefHolder.h:29
Euclid::NdArray::readNpyHeader
void readNpyHeader(std::istream &input, std::string &dtype, std::vector< size_t > &shape, std::vector< std::string > &attrs, size_t &n_elements)
Definition: NpyCommon.h:203