Alexandria  2.19
Please provide a description of the project.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
AsciiReaderHelper.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2012-2021 Euclid Science Ground Segment
3  *
4  * This library is free software; you can redistribute it and/or modify it under
5  * the terms of the GNU Lesser General Public License as published by the Free
6  * Software Foundation; either version 3.0 of the License, or (at your option)
7  * any later version.
8  *
9  * This library is distributed in the hope that it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
12  * details.
13  *
14  * You should have received a copy of the GNU Lesser General Public License
15  * along with this library; if not, write to the Free Software Foundation, Inc.,
16  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 
25 #include "AsciiReaderHelper.h"
27 #include "ElementsKernel/Logging.h"
28 #include "NdArray/NdArray.h"
29 #include <boost/algorithm/string.hpp>
30 #include <boost/lexical_cast.hpp>
31 #include <boost/tokenizer.hpp>
32 #include <set>
33 #include <sstream>
34 
35 namespace Euclid {
36 namespace Table {
37 
38 using NdArray::NdArray;
39 
41 
42 size_t countColumns(std::istream& in, const std::string& comment) {
43  StreamRewinder rewinder{in};
44  size_t count = 0;
45 
46  while (in) {
47  std::string line;
48  getline(in, line);
49  // Remove any comments
50  size_t comment_pos = line.find(comment);
51  if (comment_pos != std::string::npos) {
52  line = line.substr(0, comment_pos);
53  }
54  boost::trim(line);
55  if (!line.empty()) {
56  std::string token;
57  std::stringstream line_stream(line);
58  line_stream >> boost::io::quoted(token);
59  while (line_stream) {
60  line_stream >> boost::io::quoted(token);
61  ++count;
62  }
63  break;
64  }
65  }
66  if (count == 0) {
67  throw Elements::Exception() << "No data lines found";
68  }
69  return count;
70 }
71 
73  if (keyword == "bool" || keyword == "boolean") {
74  return typeid(bool);
75  } else if (keyword == "int" || keyword == "int32") {
76  return typeid(int32_t);
77  } else if (keyword == "long" || keyword == "int64") {
78  return typeid(int64_t);
79  } else if (keyword == "float") {
80  return typeid(float);
81  } else if (keyword == "double") {
82  return typeid(double);
83  } else if (keyword == "string") {
84  return typeid(std::string);
85  } else if (keyword == "[bool]" || keyword == "[boolean]") {
86  return typeid(std::vector<bool>);
87  } else if (keyword == "[int]" || keyword == "[int32]") {
88  return typeid(std::vector<int32_t>);
89  } else if (keyword == "[long]" || keyword == "[int64]") {
90  return typeid(std::vector<int64_t>);
91  } else if (keyword == "[float]") {
92  return typeid(std::vector<float>);
93  } else if (keyword == "[double]") {
94  return typeid(std::vector<double>);
95  } else if (keyword == "[int+]" || keyword == "[int32+]") {
96  return typeid(NdArray<int32_t>);
97  } else if (keyword == "[long+]" || keyword == "[int64+]") {
98  return typeid(NdArray<int64_t>);
99  } else if (keyword == "[float+]") {
100  return typeid(NdArray<float>);
101  } else if (keyword == "[double+]") {
102  return typeid(NdArray<double>);
103  }
104  throw Elements::Exception() << "Unknown column type keyword " << keyword;
105 }
106 
108  StreamRewinder rewinder{in};
110  while (in) {
111  std::string line;
112  getline(in, line);
113  boost::trim(line);
114  if (line.empty()) {
115  continue; // We skip empty lines
116  }
117  if (boost::starts_with(line, comment)) {
118  // If we have a comment we remove all comment characters and check if we have
119  // a column description
120  boost::replace_all(line, comment, "");
121  boost::trim(line);
122  if (boost::starts_with(line, "Column:")) {
123  line.erase(0, 7);
124  boost::trim(line);
125  if (!line.empty()) {
126  std::string token;
127  std::stringstream line_stream(line);
128  std::string name;
129  line_stream >> boost::io::quoted(name);
130  if (descriptions.count(name) != 0) {
131  throw Elements::Exception() << "Duplicate column name " << name;
132  }
133  line_stream >> boost::io::quoted(token);
134  std::type_index type = typeid(std::string);
135  if (line_stream) {
136  if (!boost::starts_with(token, "(") && token != "-") {
137  type = keywordToType(token);
138  line_stream >> boost::io::quoted(token);
139  }
140  }
141  std::string unit = "";
142  if (line_stream) {
143  if (boost::starts_with(token, "(")) {
144  unit = token;
145  unit.erase(unit.begin());
146  unit.erase(unit.end() - 1);
147  line_stream >> boost::io::quoted(token);
148  }
149  }
150  if (line_stream && token == "-") {
151  line_stream >> boost::io::quoted(token);
152  }
153  std::stringstream desc;
154  while (line_stream) {
155  desc << token << ' ';
156  line_stream >> boost::io::quoted(token);
157  }
158  std::string desc_str = desc.str();
159  boost::trim(desc_str);
160  descriptions.emplace(std::piecewise_construct, std::forward_as_tuple(name),
161  std::forward_as_tuple(name, type, unit, desc_str));
162  }
163  }
164  } else {
165  break; // here we reached the first data line
166  }
167  }
168  return descriptions;
169 }
170 
171 std::vector<std::string> autoDetectColumnNames(std::istream& in, const std::string& comment, size_t columns_number) {
172  StreamRewinder rewinder{in};
173  std::vector<std::string> names{};
174 
175  // Find the last comment line and at the same time read the names of the
176  // column info description comments
177  std::string last_comment{};
178  std::vector<std::string> desc_names{};
179  while (in) {
180  std::string line;
181  getline(in, line);
182  boost::trim(line);
183  if (line.empty()) {
184  continue; // We skip empty lines
185  }
186  if (boost::starts_with(line, comment)) {
187  // If we have a comment we remove all comment characters and check if we have
188  // the correct number of tokens
189  boost::replace_all(line, comment, "");
190  boost::trim(line);
191  if (!line.empty()) {
192  last_comment = line;
193  }
194  if (boost::starts_with(line, "Column:")) {
195  std::string temp = line;
196  temp.erase(0, 7);
197  boost::trim(temp);
198  auto space_i = temp.find(' ');
199  if (space_i > 0) {
200  temp = temp.substr(0, space_i);
201  }
202  desc_names.emplace_back(std::move(temp));
203  }
204  } else {
205  break; // here we reached the first data line
206  }
207  }
208 
209  // Check if the last comment line contains the names of the columns
210  if (!last_comment.empty()) {
211  std::stringstream line_stream(last_comment);
212  std::string token;
213  line_stream >> boost::io::quoted(token);
214  while (line_stream) {
215  names.push_back(token);
216  line_stream >> boost::io::quoted(token);
217  }
218  if (names.size() != columns_number) {
219  names.clear();
220  }
221  }
222 
223  // If the names are empty we fill them with the column descriprion ones
224  if (names.empty()) {
225  if (desc_names.size() != 0 && desc_names.size() != columns_number) {
226  logger.warn() << "Number of column descriptions does not matches the number"
227  << " of the columns";
228  }
229  names = desc_names;
230  }
231 
232  if (names.size() < columns_number) {
233  for (size_t i = names.size() + 1; i <= columns_number; ++i) {
234  names.push_back("col" + std::to_string(i));
235  }
236  }
237  // Check for duplicate names
238  std::set<std::string> set{};
239  for (auto name : names) {
240  if (!set.insert(name).second) {
241  throw Elements::Exception() << "Duplicate column name " << name;
242  }
243  }
244  return names;
245 }
246 
247 namespace {
248 
249 template <typename T>
250 std::vector<T> convertStringToVector(const std::string& str) {
251  std::vector<T> result{};
252  boost::char_separator<char> sep{","};
253  boost::tokenizer<boost::char_separator<char>> tok{str, sep};
254  for (auto& s : tok) {
255  result.push_back(boost::get<T>(convertToCellType(s, typeid(T))));
256  }
257  return result;
258 }
259 
260 template <typename T>
261 NdArray<T> convertStringToNdArray(const std::string& str) {
262  if (str.empty()) {
263  throw Elements::Exception() << "Cannot convert an empty string to a NdArray";
264  } else if (str[0] != '<') {
265  throw Elements::Exception() << "Unexpected initial character for a NdArray: " << str[0];
266  }
267 
268  auto closing_char = str.find('>');
269  if (closing_char == std::string::npos) {
270  throw Elements::Exception() << "Could not find '>'";
271  }
272 
273  auto shape_str = str.substr(1, closing_char - 1);
274  auto shape_i = convertStringToVector<int32_t>(shape_str);
275  auto data = convertStringToVector<T>(str.substr(closing_char + 1));
276 
277  std::vector<size_t> shape_u;
278  std::copy(shape_i.begin(), shape_i.end(), std::back_inserter(shape_u));
279  return NdArray<T>(shape_u, data);
280 }
281 
282 } // namespace
283 
285  try {
286  if (type == typeid(bool)) {
287  if (value == "true" || value == "t" || value == "yes" || value == "y" || value == "1") {
288  return Row::cell_type{true};
289  }
290  if (value == "false" || value == "f" || value == "no" || value == "n" || value == "0") {
291  return Row::cell_type{false};
292  }
293  } else if (type == typeid(int32_t)) {
294  return Row::cell_type{boost::lexical_cast<int32_t>(value)};
295  } else if (type == typeid(int64_t)) {
296  return Row::cell_type{boost::lexical_cast<int64_t>(value)};
297  } else if (type == typeid(float)) {
298  return Row::cell_type{boost::lexical_cast<float>(value)};
299  } else if (type == typeid(double)) {
300  return Row::cell_type{boost::lexical_cast<double>(value)};
301  } else if (type == typeid(std::string)) {
302  return Row::cell_type{boost::lexical_cast<std::string>(value)};
303  } else if (type == typeid(std::vector<bool>)) {
304  return Row::cell_type{convertStringToVector<bool>(value)};
305  } else if (type == typeid(std::vector<int32_t>)) {
306  return Row::cell_type{convertStringToVector<int32_t>(value)};
307  } else if (type == typeid(std::vector<int64_t>)) {
308  return Row::cell_type{convertStringToVector<int64_t>(value)};
309  } else if (type == typeid(std::vector<float>)) {
310  return Row::cell_type{convertStringToVector<float>(value)};
311  } else if (type == typeid(std::vector<double>)) {
312  return Row::cell_type{convertStringToVector<double>(value)};
313  } else if (type == typeid(NdArray<int32_t>)) {
314  return Row::cell_type{convertStringToNdArray<int32_t>(value)};
315  } else if (type == typeid(NdArray<int64_t>)) {
316  return Row::cell_type{convertStringToNdArray<int64_t>(value)};
317  } else if (type == typeid(NdArray<float>)) {
318  return Row::cell_type{convertStringToNdArray<float>(value)};
319  } else if (type == typeid(NdArray<double>)) {
320  return Row::cell_type{convertStringToNdArray<double>(value)};
321  }
322  } catch (boost::bad_lexical_cast const&) {
323  throw Elements::Exception() << "Cannot convert " << value << " to " << type.name();
324  }
325  throw Elements::Exception() << "Unknown type name " << type.name();
326 }
327 
328 bool hasNextRow(std::istream& in, const std::string& comment) {
329  StreamRewinder rewinder{in};
330  while (in) {
331  std::string line;
332  getline(in, line);
333  size_t comment_pos = line.find(comment);
334  if (comment_pos != std::string::npos) {
335  line = line.substr(0, comment_pos);
336  }
337  boost::trim(line);
338  if (!line.empty()) {
339  return true;
340  }
341  }
342  return false;
343 }
344 
346  StreamRewinder rewinder{in};
347  std::size_t count = 0;
348  while (in) {
349  std::string line;
350  getline(in, line);
351  size_t comment_pos = line.find(comment);
352  if (comment_pos != std::string::npos) {
353  line = line.substr(0, comment_pos);
354  }
355  boost::trim(line);
356  if (!line.empty()) {
357  ++count;
358  }
359  }
360  return count;
361 }
362 
363 } // namespace Table
364 } // end of namespace Euclid
std::size_t countRemainingRows(std::istream &in, const std::string &comment)
T empty(T...args)
T copy(T...args)
T forward_as_tuple(T...args)
T to_string(T...args)
static Elements::Logging logger
T end(T...args)
Row::cell_type convertToCellType(const std::string &value, std::type_index type)
Converts the given value to a Row::cell_type of the given type.
STL class.
STL class.
std::type_index keywordToType(const std::string &keyword)
STL class.
T push_back(T...args)
void warn(const std::string &logMessage)
T erase(T...args)
T str(T...args)
T move(T...args)
std::map< std::string, ColumnDescription > autoDetectColumnDescriptions(std::istream &in, const std::string &comment)
Reads the column descriptions of the given stream.
This class gets a stream as argument during construction and when it is deleted it sets the position ...
T count(T...args)
boost::variant< bool, int32_t, int64_t, float, double, std::string, std::vector< bool >, std::vector< int32_t >, std::vector< int64_t >, std::vector< float >, std::vector< double >, NdArray::NdArray< int32_t >, NdArray::NdArray< int64_t >, NdArray::NdArray< float >, NdArray::NdArray< double > > cell_type
The possible cell types.
Definition: Row.h:71
bool hasNextRow(std::istream &in, const std::string &comment)
T find(T...args)
std::string quoted(const std::string &str)
STL class.
STL class.
T name(T...args)
T begin(T...args)
T back_inserter(T...args)
T emplace(T...args)
T substr(T...args)
static Logging getLogger(const std::string &name="")
size_t countColumns(std::istream &in, const std::string &comment)
Returns the number of whitespace separated tokens of the first non commented line.
std::vector< std::string > autoDetectColumnNames(std::istream &in, const std::string &comment, size_t columns_number)
Reads the column names of the given stream.