Alexandria  2.22.0
Please provide a description of the project.
AsciiReaderHelper.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2012-2021 Euclid Science Ground Segment
3  *
4  * This library is free software; you can redistribute it and/or modify it under
5  * the terms of the GNU Lesser General Public License as published by the Free
6  * Software Foundation; either version 3.0 of the License, or (at your option)
7  * any later version.
8  *
9  * This library is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU Lesser General Public License
15  * along with this library; if not, write to the Free Software Foundation, Inc.,
16  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 
25 #include "AsciiReaderHelper.h"
27 #include "ElementsKernel/Logging.h"
28 #include "NdArray/NdArray.h"
29 #include <boost/algorithm/string.hpp>
30 #include <boost/lexical_cast.hpp>
31 #include <boost/tokenizer.hpp>
32 #include <set>
33 #include <sstream>
34 
35 namespace Euclid {
36 namespace Table {
37 
38 using NdArray::NdArray;
39 
41 
42 size_t countColumns(std::istream& in, const std::string& comment) {
43  StreamRewinder rewinder{in};
44  size_t count = 0;
45 
46  while (in) {
47  std::string line;
48  getline(in, line);
49  // Remove any comments
50  size_t comment_pos = line.find(comment);
51  if (comment_pos != std::string::npos) {
52  line = line.substr(0, comment_pos);
53  }
54  boost::trim(line);
55  if (!line.empty()) {
56  std::string token;
57  std::stringstream line_stream(line);
58  line_stream >> boost::io::quoted(token);
59  while (line_stream) {
60  line_stream >> boost::io::quoted(token);
61  ++count;
62  }
63  break;
64  }
65  }
66  if (count == 0) {
67  throw Elements::Exception() << "No data lines found";
68  }
69  return count;
70 }
71 
// Entries of the keyword -> C++ type lookup table. The member index at the end
// of this listing declares the table as
//   const std::vector<std::pair<std::string, std::type_index>> KeywordTypeMap
// NOTE(review): the opening declaration line is not present in this listing.
//
// "int"/"int32" and "long"/"int64" are aliases for the same type. A keyword in
// square brackets maps to a std::vector of the scalar type, and a '+' before
// the closing bracket maps to an NdArray. There is no vector or NdArray entry
// for string and no NdArray entry for bool.
75  // Boolean
76  {"bool", typeid(bool)},
77  {"boolean", typeid(bool)},
78  // Integers
79  {"int", typeid(int32_t)},
80  {"long", typeid(int64_t)},
81  {"int32", typeid(int32_t)},
82  {"int64", typeid(int64_t)},
83  // Floating point
84  {"float", typeid(float)},
85  {"double", typeid(double)},
86  // Strings
87  {"string", typeid(std::string)},
88  // Arrays
89  {"[bool]", typeid(std::vector<bool>)},
90  {"[boolean]", typeid(std::vector<bool>)},
91  {"[int]", typeid(std::vector<int32_t>)},
92  {"[long]", typeid(std::vector<int64_t>)},
93  {"[int32]", typeid(std::vector<int32_t>)},
94  {"[int64]", typeid(std::vector<int64_t>)},
95  {"[float]", typeid(std::vector<float>)},
96  {"[double]", typeid(std::vector<double>)},
97  // NdArrays
98  {"[int+]", typeid(NdArray<int32_t>)},
99  {"[long+]", typeid(NdArray<int64_t>)},
100  {"[int32+]", typeid(NdArray<int32_t>)},
101  {"[int64+]", typeid(NdArray<int64_t>)},
102  {"[float+]", typeid(NdArray<float>)},
103  {"[double+]", typeid(NdArray<double>)},
104 };
105 
// Body of keywordToType; the member index at the end of this listing gives the
// signature as
//   std::type_index keywordToType(const std::string& keyword)
// NOTE(review): the signature line itself is not present in this listing.
//
// Scans KeywordTypeMap from the front and returns the type of the first entry
// whose keyword compares equal to the given one (exact, case sensitive match).
// Throws Elements::Exception if no entry matches.
107  for (auto p = KeywordTypeMap.begin(); p != KeywordTypeMap.end(); ++p) {
108  if (p->first == keyword) {
109  return p->second;
110  }
111  }
112  throw Elements::Exception() << "Unknown column type keyword " << keyword;
113 }
114 
// Body of autoDetectColumnDescriptions; the member index at the end of this
// listing gives the signature as
//   std::map<std::string, ColumnDescription>
//   autoDetectColumnDescriptions(std::istream& in, const std::string& comment)
// NOTE(review): the signature line and the declaration of `descriptions` are
// not present in this listing; `descriptions` is presumably a local of the
// returned map type - confirm against the real file.
//
// Reads the leading comment lines of the stream and parses every line of the
// form
//   <comment> Column: NAME [TYPE] [(UNIT)] [- DESCRIPTION...]
// into an entry keyed by NAME. Parsing stops at the first non empty line which
// does not start with the comment string. Throws Elements::Exception when a
// column name appears twice; keywordToType is called for the TYPE token.
// The rewinder presumably restores the stream position when it goes out of
// scope - confirm against the StreamRewinder documentation.
116  StreamRewinder rewinder{in};
118  while (in) {
119  std::string line;
120  getline(in, line);
121  boost::trim(line);
122  if (line.empty()) {
123  continue; // We skip empty lines
124  }
125  if (boost::starts_with(line, comment)) {
126  // If we have a comment we remove all comment characters and check if we have
127  // a column description
// Note that replace_all removes every occurrence of the comment string in the
// line, not only the leading one.
128  boost::replace_all(line, comment, "");
129  boost::trim(line);
130  if (boost::starts_with(line, "Column:")) {
// 7 is the length of the "Column:" prefix
131  line.erase(0, 7);
132  boost::trim(line);
// A "Column:" line with nothing after the prefix is ignored
133  if (!line.empty()) {
134  std::string token;
135  std::stringstream line_stream(line);
136  std::string name;
// The name may be double quoted, in which case it can contain whitespace
137  line_stream >> boost::io::quoted(name);
138  if (descriptions.count(name) != 0) {
139  throw Elements::Exception() << "Duplicate column name " << name;
140  }
141  line_stream >> boost::io::quoted(token);
// The type stays std::string when no type keyword follows the name
142  std::type_index type = typeid(std::string);
143  if (line_stream) {
// A token which neither opens a unit nor is the "-" separator is taken as the
// type keyword
144  if (!boost::starts_with(token, "(") && token != "-") {
145  type = keywordToType(token);
146  line_stream >> boost::io::quoted(token);
147  }
148  }
149  std::string unit = "";
150  if (line_stream) {
151  if (boost::starts_with(token, "(")) {
// The first and the last character of the token are dropped; the last one is
// not checked to be ')'
152  unit = token;
153  unit.erase(unit.begin());
154  unit.erase(unit.end() - 1);
155  line_stream >> boost::io::quoted(token);
156  }
157  }
// Skip the optional "-" which separates the description text
158  if (line_stream && token == "-") {
159  line_stream >> boost::io::quoted(token);
160  }
// All remaining tokens are joined with single spaces to form the description
161  std::stringstream desc;
162  while (line_stream) {
163  desc << token << ' ';
164  line_stream >> boost::io::quoted(token);
165  }
166  std::string desc_str = desc.str();
167  boost::trim(desc_str);
// Construct the mapped value in place from (name, type, unit, description)
168  descriptions.emplace(std::piecewise_construct, std::forward_as_tuple(name),
169  std::forward_as_tuple(name, type, unit, desc_str));
170  }
171  }
172  } else {
173  break; // here we reached the first data line
174  }
175  }
176  return descriptions;
177 }
178 
179 std::vector<std::string> autoDetectColumnNames(std::istream& in, const std::string& comment, size_t columns_number) {
180  StreamRewinder rewinder{in};
181  std::vector<std::string> names{};
182 
183  // Find the last comment line and at the same time read the names of the
184  // column info description comments
185  std::string last_comment{};
186  std::vector<std::string> desc_names{};
187  while (in) {
188  std::string line;
189  getline(in, line);
190  boost::trim(line);
191  if (line.empty()) {
192  continue; // We skip empty lines
193  }
194  if (boost::starts_with(line, comment)) {
195  // If we have a comment we remove all comment characters and check if we have
196  // the correct number of tokens
197  boost::replace_all(line, comment, "");
198  boost::trim(line);
199  if (!line.empty()) {
200  last_comment = line;
201  }
202  if (boost::starts_with(line, "Column:")) {
203  std::string temp = line;
204  temp.erase(0, 7);
205  boost::trim(temp);
206  auto space_i = temp.find(' ');
207  if (space_i > 0) {
208  temp = temp.substr(0, space_i);
209  }
210  desc_names.emplace_back(std::move(temp));
211  }
212  } else {
213  break; // here we reached the first data line
214  }
215  }
216 
217  // Check if the last comment line contains the names of the columns
218  if (!last_comment.empty()) {
219  std::stringstream line_stream(last_comment);
220  std::string token;
221  line_stream >> boost::io::quoted(token);
222  while (line_stream) {
223  names.push_back(token);
224  line_stream >> boost::io::quoted(token);
225  }
226  if (names.size() != columns_number) {
227  names.clear();
228  }
229  }
230 
231  // If the names are empty we fill them with the column descriprion ones
232  if (names.empty()) {
233  if (desc_names.size() != 0 && desc_names.size() != columns_number) {
234  logger.warn() << "Number of column descriptions does not matches the number"
235  << " of the columns";
236  }
237  names = desc_names;
238  }
239 
240  if (names.size() < columns_number) {
241  for (size_t i = names.size() + 1; i <= columns_number; ++i) {
242  names.push_back("col" + std::to_string(i));
243  }
244  }
245  // Check for duplicate names
246  std::set<std::string> set{};
247  for (auto name : names) {
248  if (!set.insert(name).second) {
249  throw Elements::Exception() << "Duplicate column name " << name;
250  }
251  }
252  return names;
253 }
254 
255 namespace {
256 
257 template <typename T>
258 std::vector<T> convertStringToVector(const std::string& str) {
259  std::vector<T> result{};
260  boost::char_separator<char> sep{","};
261  boost::tokenizer<boost::char_separator<char>> tok{str, sep};
262  for (auto& s : tok) {
263  result.push_back(boost::get<T>(convertToCellType(s, typeid(T))));
264  }
265  return result;
266 }
267 
268 template <typename T>
269 NdArray<T> convertStringToNdArray(const std::string& str) {
270  if (str.empty()) {
271  throw Elements::Exception() << "Cannot convert an empty string to a NdArray";
272  } else if (str[0] != '<') {
273  throw Elements::Exception() << "Unexpected initial character for a NdArray: " << str[0];
274  }
275 
276  auto closing_char = str.find('>');
277  if (closing_char == std::string::npos) {
278  throw Elements::Exception() << "Could not find '>'";
279  }
280 
281  auto shape_str = str.substr(1, closing_char - 1);
282  auto shape_i = convertStringToVector<int32_t>(shape_str);
283  auto data = convertStringToVector<T>(str.substr(closing_char + 1));
284 
285  std::vector<size_t> shape_u;
286  std::copy(shape_i.begin(), shape_i.end(), std::back_inserter(shape_u));
287  return NdArray<T>(shape_u, data);
288 }
289 
290 } // namespace
291 
// Entries of the type -> converter lookup table. The member index at the end
// of this listing declares the table as
//   const std::map<std::type_index,
//                  std::function<Row::cell_type(const std::string&)>> sCellConverter
// NOTE(review): the opening declaration line is not present in this listing.
//
// Every converter receives the textual value of a cell and returns it
// converted to the type used as the key of the entry.
293  // Boolean
294  {typeid(bool),
295  [](const std::string& value) {
// Only these exact spellings are accepted; the comparison is case sensitive
296  if (value == "true" || value == "t" || value == "yes" || value == "y" || value == "1") {
297  return true;
298  } else if (value == "false" || value == "f" || value == "no" || value == "n" || value == "0") {
299  return false;
300  }
301  throw Elements::Exception() << "Invalid boolean value " << value;
302  }},
303  // Integers
304  {typeid(int32_t), boost::lexical_cast<int32_t, const std::string&>},
305  {typeid(int64_t), boost::lexical_cast<int64_t, const std::string&>},
306  // Floating point
307  {typeid(float), boost::lexical_cast<float, const std::string&>},
308  {typeid(double), boost::lexical_cast<double, const std::string&>},
309  // String
310  {typeid(std::string), boost::lexical_cast<std::string, const std::string&>},
311  // Arrays
// Comma separated values, each converted to the element type
312  {typeid(std::vector<bool>), convertStringToVector<bool>},
313  {typeid(std::vector<int32_t>), convertStringToVector<int32_t>},
314  {typeid(std::vector<int64_t>), convertStringToVector<int64_t>},
315  {typeid(std::vector<float>), convertStringToVector<float>},
316  {typeid(std::vector<double>), convertStringToVector<double>},
317  // NdArray
// The shape between '<' and '>' followed by the comma separated values
318  {typeid(NdArray<int32_t>), convertStringToNdArray<int32_t>},
319  {typeid(NdArray<int64_t>), convertStringToNdArray<int64_t>},
320  {typeid(NdArray<float>), convertStringToNdArray<float>},
321  {typeid(NdArray<double>), convertStringToNdArray<double>},
322 };
323 
// Body of convertToCellType; the member index at the end of this listing gives
// the signature as
//   Row::cell_type convertToCellType(const std::string& value, std::type_index type)
// NOTE(review): the signature line itself is not present in this listing.
//
// Looks up the converter registered for the given type in sCellConverter and
// applies it to the value. Throws Elements::Exception when there is no
// converter for the type or when the conversion fails with a
// boost::bad_lexical_cast. Any other exception thrown by a converter is not
// caught here.
325  try {
326  auto i = sCellConverter.find(type);
327  if (i == sCellConverter.end()) {
328  throw Elements::Exception() << "Unknown type name " << type.name();
329  }
330  return i->second(value);
331  } catch (boost::bad_lexical_cast const&) {
// Replace the boost exception with one which names the value and the type
332  throw Elements::Exception() << "Cannot convert " << value << " to " << type.name();
333  }
334 }
335 
336 bool hasNextRow(std::istream& in, const std::string& comment) {
337  StreamRewinder rewinder{in};
338  while (in) {
339  std::string line;
340  getline(in, line);
341  size_t comment_pos = line.find(comment);
342  if (comment_pos != std::string::npos) {
343  line = line.substr(0, comment_pos);
344  }
345  boost::trim(line);
346  if (!line.empty()) {
347  return true;
348  }
349  }
350  return false;
351 }
352 
// Body of countRemainingRows; the member index at the end of this listing
// gives the signature as
//   std::size_t countRemainingRows(std::istream& in, const std::string& comment)
// NOTE(review): the signature line itself is not present in this listing.
//
// Reads the stream to its end and returns the number of lines which are not
// empty after the comment part has been removed and the rest has been trimmed.
// The rewinder presumably restores the stream position when it goes out of
// scope - confirm against the StreamRewinder documentation.
354  StreamRewinder rewinder{in};
355  std::size_t count = 0;
356  while (in) {
357  std::string line;
358  getline(in, line);
// Everything from the first occurrence of the comment string onwards is dropped
359  size_t comment_pos = line.find(comment);
360  if (comment_pos != std::string::npos) {
361  line = line.substr(0, comment_pos);
362  }
363  boost::trim(line);
364  if (!line.empty()) {
365  ++count;
366  }
367  }
368  return count;
369 }
370 
371 } // namespace Table
372 } // end of namespace Euclid
T back_inserter(T... args)
T begin(T... args)
static Logging getLogger(const std::string &name="")
void warn(const std::string &logMessage)
NdArray(const std::vector< size_t > &shape_)
boost::variant< bool, int32_t, int64_t, float, double, std::string, std::vector< bool >, std::vector< int32_t >, std::vector< int64_t >, std::vector< float >, std::vector< double >, NdArray::NdArray< int32_t >, NdArray::NdArray< int64_t >, NdArray::NdArray< float >, NdArray::NdArray< double > > cell_type
The possible cell types.
Definition: Row.h:71
This class gets a stream as argument during construction and when it is deleted it sets the position ...
T copy(T... args)
T count(T... args)
T emplace(T... args)
T empty(T... args)
T end(T... args)
T erase(T... args)
T find(T... args)
T forward_as_tuple(T... args)
T move(T... args)
T name(T... args)
std::type_index keywordToType(const std::string &keyword)
bool hasNextRow(std::istream &in, const std::string &comment)
std::map< std::string, ColumnDescription > autoDetectColumnDescriptions(std::istream &in, const std::string &comment)
Reads the column descriptions of the given stream.
size_t countColumns(std::istream &in, const std::string &comment)
Returns the number of whitespace separated tokens of the first non commented line.
std::vector< std::string > autoDetectColumnNames(std::istream &in, const std::string &comment, size_t columns_number)
Reads the column names of the given stream.
Row::cell_type convertToCellType(const std::string &value, std::type_index type)
Converts the given value to a Row::cell_type of the given type.
const std::vector< std::pair< std::string, std::type_index > > KeywordTypeMap
std::size_t countRemainingRows(std::istream &in, const std::string &comment)
const std::map< std::type_index, std::function< Row::cell_type(const std::string &)> > sCellConverter
static Elements::Logging logger
std::string quoted(const std::string &str)
T push_back(T... args)
T str(T... args)
T substr(T... args)
T to_string(T... args)