Alexandria  2.14.1
Please provide a description of the project.
AsciiReaderHelper.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2012-2020 Euclid Science Ground Segment
3  *
4  * This library is free software; you can redistribute it and/or modify it under
5  * the terms of the GNU Lesser General Public License as published by the Free
6  * Software Foundation; either version 3.0 of the License, or (at your option)
7  * any later version.
8  *
9  * This library is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU Lesser General Public License
15  * along with this library; if not, write to the Free Software Foundation, Inc.,
16  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 
25 #include <set>
26 #include <sstream>
27 #include <boost/regex.hpp>
28 using boost::regex;
29 using boost::regex_match;
30 #include <boost/algorithm/string.hpp>
31 #include <boost/lexical_cast.hpp>
32 #include <boost/tokenizer.hpp>
34 #include "ElementsKernel/Logging.h"
35 #include "AsciiReaderHelper.h"
36 #include "NdArray/NdArray.h"
37 
38 namespace Euclid {
39 namespace Table {
40 
41 using NdArray::NdArray;
42 
44 
45 size_t countColumns(std::istream& in, const std::string& comment) {
46  StreamRewinder rewinder {in};
47  size_t count = 0;
48  regex column_separator {"\\s+"};
49  while (in) {
50  std::string line;
51  getline(in, line);
52  // Remove any comments
53  size_t comment_pos = line.find(comment);
54  if (comment_pos != std::string::npos) {
55  line = line.substr(0, comment_pos);
56  }
57  boost::trim(line);
58  if (!line.empty()) {
59  boost::sregex_token_iterator i (line.begin(), line.end(), column_separator, -1);
60  boost::sregex_token_iterator j;
61  while (i != j) {
62  ++i;
63  ++count;
64  }
65  break;
66  }
67  }
68  if (count == 0) {
69  throw Elements::Exception() << "No data lines found";
70  }
71  return count;
72 }
73 
75  if (keyword == "bool" || keyword == "boolean") {
76  return typeid(bool);
77  } else if (keyword == "int" || keyword == "int32") {
78  return typeid(int32_t);
79  } else if (keyword == "long" || keyword == "int64") {
80  return typeid(int64_t);
81  } else if (keyword == "float") {
82  return typeid(float);
83  } else if (keyword == "double") {
84  return typeid(double);
85  } else if (keyword == "string") {
86  return typeid(std::string);
87  } else if (keyword == "[bool]" || keyword == "[boolean]") {
88  return typeid(std::vector<bool>);
89  } else if (keyword == "[int]" || keyword == "[int32]") {
90  return typeid(std::vector<int32_t>);
91  } else if (keyword == "[long]" || keyword == "[int64]") {
92  return typeid(std::vector<int64_t>);
93  } else if (keyword == "[float]") {
94  return typeid(std::vector<float>);
95  } else if (keyword == "[double]") {
96  return typeid(std::vector<double>);
97  } else if (keyword == "[bool+]" || keyword == "[boolean+]") {
98  return typeid(NdArray<bool>);
99  } else if (keyword == "[int+]" || keyword == "[int32+]") {
100  return typeid(NdArray<int32_t>);
101  } else if (keyword == "[long+]" || keyword == "[int64+]") {
102  return typeid(NdArray<int64_t>);
103  } else if (keyword == "[float+]") {
104  return typeid(NdArray<float>);
105  } else if (keyword == "[double+]") {
106  return typeid(NdArray<double>);
107  }
108  throw Elements::Exception() << "Unknown column type keyword " << keyword;
109 }
110 
112  std::istream& in, const std::string& comment) {
113  StreamRewinder rewinder {in};
115  while (in) {
116  std::string line;
117  getline(in, line);
118  boost::trim(line);
119  if (line.empty()) {
120  continue; // We skip empty lines
121  }
122  if (boost::starts_with(line, comment)) {
123  // If we have a comment we remove all comment characters and check if we have
124  // a column description
125  boost::replace_all(line, comment, "");
126  boost::trim(line);
127  if (boost::starts_with(line, "Column:")) {
128  line.erase(0, 7);
129  boost::trim(line);
130  if (!line.empty()) {
131  boost::sregex_token_iterator token (line.begin(), line.end(), regex{"\\s+"}, -1);
132  boost::sregex_token_iterator end;
133  std::string name = *token;
134  if (descriptions.count(name) != 0) {
135  throw Elements::Exception() << "Duplicate column name " << name;
136  }
137  ++token;
138  std::type_index type = typeid(std::string);
139  if (token != end) {
140  std::string token_str =*token;
141  if (!boost::starts_with(token_str, "(") && token_str != "-") {
142  type = keywordToType(token_str);
143  ++token;
144  }
145  }
146  std::string unit = "";
147  if (token != end) {
148  std::string token_str = *token;
149  if (boost::starts_with(token_str, "(")) {
150  unit = token_str;
151  unit.erase(unit.begin());
152  unit.erase(unit.end()-1);
153  ++token;
154  }
155  }
156  if (token != end && *token == "-") {
157  ++token;
158  }
159  std::stringstream desc;
160  while (token != end) {
161  desc << *token << ' ';
162  ++token;
163  }
164  std::string desc_str = desc.str();
165  boost::trim(desc_str);
166  descriptions.emplace(std::piecewise_construct,
167  std::forward_as_tuple(name),
168  std::forward_as_tuple(name, type, unit, desc_str));
169  }
170  }
171  } else {
172  break; // here we reached the first data line
173  }
174  }
175  return descriptions;
176 }
177 
179  const std::string& comment,
180  size_t columns_number) {
181  StreamRewinder rewinder {in};
182  std::vector<std::string> names {};
183 
184  // Find the last comment line and at the same time read the names of the
185  // column info description comments
186  std::string last_comment {};
187  std::vector<std::string> desc_names {};
188  while (in) {
189  std::string line;
190  getline(in, line);
191  boost::trim(line);
192  if (line.empty()) {
193  continue; // We skip empty lines
194  }
195  if (boost::starts_with(line, comment)) {
196  // If we have a comment we remove all comment characters and check if we have
197  // the correct number of tokens
198  boost::replace_all(line, comment, "");
199  boost::trim(line);
200  if (!line.empty()) {
201  last_comment = line;
202  }
203  if (boost::starts_with(line, "Column:")) {
204  std::string temp = line;
205  temp.erase(0, 7);
206  boost::trim(temp);
207  auto space_i = temp.find(' ');
208  if (space_i > 0) {
209  temp = temp.substr(0, space_i);
210  }
211  desc_names.emplace_back(std::move(temp));
212  }
213  } else {
214  break; // here we reached the first data line
215  }
216  }
217 
218  // Check if the last comment line contains the names of the columns
219  if (!last_comment.empty()){
220  boost::sregex_token_iterator i (last_comment.begin(), last_comment.end(), regex{"\\s+"}, -1);
221  boost::sregex_token_iterator j;
222  while (i != j) {
223  names.push_back(*i);
224  ++i;
225  }
226  if (names.size() != columns_number) {
227  names.clear();
228  }
229  }
230 
231  // If the names are empty we fill them with the column descriprion ones
232  if (names.empty()) {
233  if (desc_names.size() != 0 && desc_names.size() != columns_number) {
234  logger.warn() << "Number of column descriptions does not matches the number"
235  << " of the columns";
236  }
237  names = desc_names;
238  }
239 
240  if (names.size() < columns_number) {
241  for (size_t i=names.size()+1; i<=columns_number; ++i) {
242  names.push_back("col" + std::to_string(i));
243  }
244  }
245  // Check for duplicate names
246  std::set<std::string> set {};
247  for (auto name : names) {
248  if (!set.insert(name).second) {
249  throw Elements::Exception() << "Duplicate column name " << name;
250  }
251  }
252  return names;
253 }
254 
255 namespace {
256 
257 template <typename T>
258 std::vector<T> convertStringToVector(const std::string& str) {
259  std::vector<T> result {};
260  boost::char_separator<char> sep {","};
261  boost::tokenizer< boost::char_separator<char> > tok {str, sep};
262  for (auto& s : tok) {
263  result.push_back(boost::get<T>(convertToCellType(s, typeid(T))));
264  }
265  return result;
266 }
267 
268 template <typename T>
269 NdArray<T> convertStringToNdArray(const std::string& str) {
270  if (str.empty()) {
271  throw Elements::Exception() << "Cannot convert an empty string to a NdArray";
272  } else if (str[0] != '<') {
273  throw Elements::Exception() << "Unexpected initial character for a NdArray: " << str[0];
274  }
275 
276  auto closing_char = str.find('>');
277  if (closing_char == std::string::npos) {
278  throw Elements::Exception() << "Could not find '>'";
279  }
280 
281  auto shape_str = str.substr(1, closing_char - 1);
282  auto shape_i = convertStringToVector<int32_t>(shape_str);
283  auto data = convertStringToVector<T>(str.substr(closing_char + 1));
284 
285  std::vector<size_t> shape_u;
286  std::copy(shape_i.begin(), shape_i.end(), std::back_inserter(shape_u));
287  return NdArray<T>(shape_u, data);
288 }
289 
290 }
291 
293  try {
294  if (type == typeid(bool)) {
295  if (value == "true" || value == "t" || value == "yes" || value == "y" || value == "1") {
296  return Row::cell_type {true};
297  }
298  if (value == "false" || value == "f" || value == "no" || value == "n" || value == "0") {
299  return Row::cell_type {false};
300  }
301  } else if (type == typeid(int32_t)) {
302  return Row::cell_type {boost::lexical_cast<int32_t>(value)};
303  } else if (type == typeid(int64_t)) {
304  return Row::cell_type {boost::lexical_cast<int64_t>(value)};
305  } else if (type == typeid(float)) {
306  return Row::cell_type {boost::lexical_cast<float>(value)};
307  } else if (type == typeid(double)) {
308  return Row::cell_type {boost::lexical_cast<double>(value)};
309  } else if (type == typeid(std::string)) {
310  return Row::cell_type {boost::lexical_cast<std::string>(value)};
311  } else if (type == typeid(std::vector<bool>)) {
312  return Row::cell_type {convertStringToVector<bool>(value)};
313  } else if (type == typeid(std::vector<int32_t>)) {
314  return Row::cell_type {convertStringToVector<int32_t>(value)};
315  } else if (type == typeid(std::vector<int64_t>)) {
316  return Row::cell_type {convertStringToVector<int64_t>(value)};
317  } else if (type == typeid(std::vector<float>)) {
318  return Row::cell_type {convertStringToVector<float>(value)};
319  } else if (type == typeid(std::vector<double>)) {
320  return Row::cell_type {convertStringToVector<double>(value)};
321  } else if (type == typeid(NdArray<bool>)) {
322  return Row::cell_type {convertStringToNdArray<bool>(value)};
323  } else if (type == typeid(NdArray<int32_t>)) {
324  return Row::cell_type {convertStringToNdArray<int32_t>(value)};
325  } else if (type == typeid(NdArray<int64_t>)) {
326  return Row::cell_type {convertStringToNdArray<int64_t>(value)};
327  } else if (type == typeid(NdArray<float>)) {
328  return Row::cell_type {convertStringToNdArray<float>(value)};
329  } else if (type == typeid(NdArray<double>)) {
330  return Row::cell_type {convertStringToNdArray<double>(value)};
331  }
332  } catch( boost::bad_lexical_cast const& ) {
333  throw Elements::Exception() << "Cannot convert " << value << " to " << type.name();
334  }
335  throw Elements::Exception() << "Unknown type name " << type.name();
336 }
337 
338 bool hasNextRow(std::istream& in, const std::string& comment) {
339  StreamRewinder rewinder {in};
340  while(in) {
341  std::string line;
342  getline(in, line);
343  size_t comment_pos = line.find(comment);
344  if (comment_pos != std::string::npos) {
345  line = line.substr(0, comment_pos);
346  }
347  boost::trim(line);
348  if (!line.empty()) {
349  return true;
350  }
351  }
352  return false;
353 }
354 
356  StreamRewinder rewinder {in};
357  std::size_t count = 0;
358  while(in) {
359  std::string line;
360  getline(in, line);
361  size_t comment_pos = line.find(comment);
362  if (comment_pos != std::string::npos) {
363  line = line.substr(0, comment_pos);
364  }
365  boost::trim(line);
366  if (!line.empty()) {
367  ++count;
368  }
369  }
370  return count;
371 }
372 
373 }
374 } // end of namespace Euclid
Euclid::Table::convertToCellType
Row::cell_type convertToCellType(const std::string &value, std::type_index type)
Converts the given value to a Row::cell_type of the given type.
Definition: AsciiReaderHelper.cpp:292
std::string
STL class.
std::move
T move(T... args)
Euclid::Table::countRemainingRows
std::size_t countRemainingRows(std::istream &in, const std::string &comment)
Definition: AsciiReaderHelper.cpp:355
Euclid::Table::autoDetectColumnDescriptions
std::map< std::string, ColumnDescription > autoDetectColumnDescriptions(std::istream &in, const std::string &comment)
Reads the column descriptions of the given stream.
Definition: AsciiReaderHelper.cpp:111
Elements::Logging
std::vector
STL class.
std::string::find
T find(T... args)
std::back_inserter
T back_inserter(T... args)
std::type_index::name
T name(T... args)
std::map::emplace
T emplace(T... args)
std::type_index
std::stringstream
STL class.
Euclid::NdArray::NdArray
Definition: NdArray.h:45
std::string::push_back
T push_back(T... args)
Euclid::NdArray::NdArray::NdArray
NdArray(const std::vector< size_t > &shape)
Definition: NdArray.h:62
Euclid::Table::hasNextRow
bool hasNextRow(std::istream &in, const std::string &comment)
Definition: AsciiReaderHelper.cpp:338
Euclid::Table::keywordToType
std::type_index keywordToType(const std::string &keyword)
Definition: AsciiReaderHelper.cpp:74
Euclid::Table::autoDetectColumnNames
std::vector< std::string > autoDetectColumnNames(std::istream &in, const std::string &comment, size_t columns_number)
Reads the column names of the given stream.
Definition: AsciiReaderHelper.cpp:178
Exception.h
std::to_string
T to_string(T... args)
Elements::Logging::warn
void warn(const std::string &logMessage)
std::string::erase
T erase(T... args)
std::copy
T copy(T... args)
Elements::Exception
std::forward_as_tuple
T forward_as_tuple(T... args)
std::map
STL class.
AsciiReaderHelper.h
Euclid::Table::Row::cell_type
boost::variant< bool, int32_t, int64_t, float, double, std::string, std::vector< bool >, std::vector< int32_t >, std::vector< int64_t >, std::vector< float >, std::vector< double >, NdArray::NdArray< bool >, NdArray::NdArray< int32_t >, NdArray::NdArray< int64_t >, NdArray::NdArray< float >, NdArray::NdArray< double > > cell_type
The possible cell types.
Definition: Row.h:84
Elements::Logging::getLogger
static Logging getLogger(const std::string &name="")
std::string::substr
T substr(T... args)
Euclid::Table::StreamRewinder
This class gets a stream as argument during construction and when it is deleted it sets the position ...
Definition: AsciiReaderHelper.h:48
NdArray.h
std::string::begin
T begin(T... args)
Euclid::Table::logger
static Elements::Logging logger
Definition: AsciiReaderHelper.cpp:43
std::map::count
T count(T... args)
std::string::empty
T empty(T... args)
std::stringstream::str
T str(T... args)
std::size_t
Logging.h
Euclid::Table::countColumns
size_t countColumns(std::istream &in, const std::string &comment)
Returns the number of whitespace separated tokens of the first non commented line.
Definition: AsciiReaderHelper.cpp:45
std::string::end
T end(T... args)
std::istream
STL class.
Euclid
Definition: InstOrRefHolder.h:29
std::set
STL class.