GCC Code Coverage Report


Directory: gridformat/
File: gridformat/xml/parser.hpp
Date: 2024-11-20 14:41:59
Exec Total Coverage
Lines: 130 140 92.9%
Functions: 14 14 100.0%
Branches: 144 287 50.2%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2022-2023 Dennis Gläser <dennis.glaeser@iws.uni-stuttgart.de>
2 // SPDX-License-Identifier: MIT
3 /*!
4 * \ingroup XML
5 * \copydoc GridFormat::XMLParser
6 */
7 #ifndef GRIDFORMAT_XML_PARSER_HPP_
8 #define GRIDFORMAT_XML_PARSER_HPP_
9
10 #include <cmath>
11 #include <string>
12 #include <memory>
13 #include <istream>
14 #include <fstream>
15 #include <type_traits>
16 #include <unordered_map>
17 #include <functional>
18 #include <optional>
19 #include <concepts>
20 #include <limits>
21
22 #include <gridformat/common/exceptions.hpp>
23 #include <gridformat/common/istream_helper.hpp>
24 #include <gridformat/xml/element.hpp>
25 #include <gridformat/xml/tag.hpp>
26
27 namespace GridFormat {
28
29
30 /*!
31 * \ingroup XML
32 * \brief Parses an XML file into an XMLElement.
33 * \note Discards any comments.
34 * \note Creates a single root element in which the parsed elements are placed.
35 * \note The XML element contents are not read. Instead, their bounds within the input stream
36 * are stored separately and the content can be retrieved via get_content(const XMLElement&).
37 * \note Content inside XML elements is assumed to be either before or after child elements. If multiple
38 * pieces of content are intermingled with child elements, only the first piece of content will be detected.
39 * \note This implementation is not a fully-fledged XML parser, but suffices for our requirements.
40 * It is likely to fail when textual content that can be mistaken for xml is inside the elements.
41 */
42 class XMLParser {
43 public:
44 using ContentSkipFunction = std::function<bool(const XMLElement&)>;
45
46 struct StreamBounds {
47 std::streamsize begin_pos;
48 std::streamsize end_pos;
49 };
50
51 /*!
52 * \brief Parse an xml tree from the data in the file with the given name.
53 * \param filename The name of the xml file.
54 * \param root_name The name of the root element in which to place the read xml (default: "ROOT")
55 * \param skip_content_parsing A function that takes an xml element and returns true if the content
56 * of that element should not be parsed for child nodes. This is useful
57 * if the content of an element is very large and potentially invalid xml.
58 */
59 1932 explicit XMLParser(const std::string& filename,
60 const std::string& root_name = "ROOT",
61 const ContentSkipFunction& skip_content_parsing = [] (const XMLElement&) { return false; })
62 1932 : _owned{std::make_unique<std::ifstream>()}
63 3864 , _helper{*_owned}
64
1/2
✓ Branch 1 taken 1932 times.
✗ Branch 2 not taken.
1932 , _element{root_name}
65
1/2
✓ Branch 1 taken 1932 times.
✗ Branch 2 not taken.
1932 , _skip_content{skip_content_parsing} {
66
1/2
✓ Branch 2 taken 1932 times.
✗ Branch 3 not taken.
1932 _owned->open(filename);
67
4/6
✓ Branch 1 taken 3864 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 3864 times.
✗ Branch 5 not taken.
✓ Branch 8 taken 1932 times.
✓ Branch 9 taken 1932 times.
9660 while (_parse_next_element(_element)) {}
68 1932 }
69
70 //! Overload for reading from an existing stream
71 1 explicit XMLParser(std::istream& stream,
72 const std::string& root_name = "ROOT",
73 5 const ContentSkipFunction& skip_content_parsing = [] (const XMLElement&) { return false; })
74 1 : _owned{}
75 2 , _helper{stream}
76
1/2
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
1 , _element{root_name}
77
1/2
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
1 , _skip_content{skip_content_parsing} {
78
4/6
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✓ Branch 8 taken 1 times.
✓ Branch 9 taken 1 times.
5 while (_parse_next_element(_element)) {}
79 1 }
80
81 //! Return a reference to the read xml representation
82 103218 const XMLElement& get_xml() const & {
83 103218 return _element;
84 }
85
86 //! Return the read xml representation as an rvalue
87 XMLElement&& get_xml() && {
88 return std::move(_element);
89 }
90
91 //! Return true if content was read for the given xml element
92 8 bool has_content(const XMLElement& e) const {
93
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
8 return _content_bounds.contains(&e);
94 }
95
96 //! Return the stream bounds for the content of the given xml element
97 10542 const StreamBounds& get_content_bounds(const XMLElement& e) const {
98
1/2
✓ Branch 1 taken 10542 times.
✗ Branch 2 not taken.
10542 return _content_bounds.at(&e);
99 }
100
101 //! Read and return the content of the given xml element
102 8 std::string read_content_for(const XMLElement& e, const std::optional<std::size_t> max_chars = {}) {
103
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
8 const auto& bounds = _content_bounds.at(&e);
104 8 const auto content_size = bounds.end_pos - bounds.begin_pos;
105 8 const auto num_chars = std::min(static_cast<std::size_t>(content_size), max_chars.value_or(content_size));
106
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
8 _helper.seek_position(bounds.begin_pos);
107
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
16 return _helper.read_chunk(num_chars);
108 }
109
110 private:
111 // parse the content or child elements from the stream and add them to the given parent element
112 15571 void _parse_content(XMLElement& parent) {
113
1/2
✓ Branch 2 taken 15571 times.
✗ Branch 3 not taken.
15571 const std::string close_tag = "</" + parent.name();
114
1/2
✓ Branch 1 taken 15571 times.
✗ Branch 2 not taken.
15571 auto content_begin_pos = _helper.position();
115 15571 auto content_end_pos = content_begin_pos;
116
117
3/4
✓ Branch 1 taken 15571 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 1085 times.
✓ Branch 4 taken 14486 times.
15571 if (_skip_content(parent)) {
118
2/4
✓ Branch 1 taken 1085 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 1085 times.
1085 if (!_helper.shift_until_substr(close_tag))
119 throw IOError("Could not find closing tag: " + close_tag);
120
1/2
✓ Branch 1 taken 1085 times.
✗ Branch 2 not taken.
1085 content_end_pos = _helper.position();
121
1/2
✓ Branch 2 taken 1085 times.
✗ Branch 3 not taken.
1085 _helper.shift_by(close_tag.size());
122 } else {
123 // check for content before the first child
124
3/6
✓ Branch 2 taken 14486 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 14486 times.
✗ Branch 6 not taken.
✗ Branch 8 not taken.
✓ Branch 9 taken 14486 times.
43458 if (!_helper.shift_until_any_of("<"))
125 throw IOError("Could not find closing tag for '" + parent.name() + "'");
126
1/2
✓ Branch 1 taken 14486 times.
✗ Branch 2 not taken.
14486 content_end_pos = _helper.position();
127
1/2
✓ Branch 1 taken 14486 times.
✗ Branch 2 not taken.
14486 _helper.seek_position(content_begin_pos);
128
1/2
✓ Branch 1 taken 14486 times.
✗ Branch 2 not taken.
14486 _helper.shift_whitespace();
129
1/2
✓ Branch 1 taken 14486 times.
✗ Branch 2 not taken.
14486 const bool have_read_content = _helper.position() < content_end_pos;
130
1/2
✓ Branch 1 taken 14486 times.
✗ Branch 2 not taken.
14486 _helper.seek_position(content_end_pos);
131
132 // parse all children
133 14486 std::optional<std::streamsize> position_after_last_child;
134 while (true) {
135
1/2
✓ Branch 1 taken 50262 times.
✗ Branch 2 not taken.
50262 auto pos = _parse_next_element(parent, close_tag);
136
2/2
✓ Branch 1 taken 35776 times.
✓ Branch 2 taken 14486 times.
50262 if (pos) position_after_last_child = pos;
137 14486 else break;
138 35776 }
139
140 // (maybe) check for content after the children
141
6/6
✓ Branch 0 taken 12861 times.
✓ Branch 1 taken 1625 times.
✓ Branch 3 taken 12357 times.
✓ Branch 4 taken 504 times.
✓ Branch 5 taken 12357 times.
✓ Branch 6 taken 2129 times.
14486 if (!have_read_content && position_after_last_child.has_value()) {
142
1/2
✓ Branch 1 taken 12357 times.
✗ Branch 2 not taken.
12357 content_begin_pos = position_after_last_child.value();
143
1/2
✓ Branch 1 taken 12357 times.
✗ Branch 2 not taken.
12357 content_end_pos = _helper.position();
144 }
145
146
2/4
✓ Branch 2 taken 14486 times.
✗ Branch 3 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 14486 times.
14486 if (_helper.read_chunk(close_tag.size()) != close_tag)
147 throw IOError("Could not find closing tag for '" + parent.name() + "'");
148
3/6
✓ Branch 2 taken 14486 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 14486 times.
✗ Branch 6 not taken.
✗ Branch 8 not taken.
✓ Branch 9 taken 14486 times.
43458 if (!_helper.shift_until_any_of(">"))
149 throw IOError("Could not find closing tag for '" + parent.name() + "'");
150
2/4
✓ Branch 1 taken 14486 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 14486 times.
✗ Branch 4 not taken.
14486 if (!_helper.is_end_of_file())
151
1/2
✓ Branch 1 taken 14486 times.
✗ Branch 2 not taken.
14486 _helper.shift_by(1);
152 }
153
154
1/2
✓ Branch 1 taken 15571 times.
✗ Branch 2 not taken.
15571 _content_bounds[&parent] = StreamBounds{.begin_pos = content_begin_pos, .end_pos = content_end_pos};
155 15571 }
156
157 // parse the next child element from the stream and return the position after it (or none if no child found)
158 54128 std::optional<std::streamsize> _parse_next_element(XMLElement& parent, const std::string& close_tag = "") {
159 while (true) {
160
3/4
✓ Branch 1 taken 55851 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 1507 times.
✓ Branch 4 taken 54344 times.
55851 if (_helper.is_end_of_file()) {
161
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 1507 times.
1507 if (!close_tag.empty())
162 throw IOError("Did not find closing tag: " + close_tag);
163 1507 return {};
164 }
165
166
4/6
✓ Branch 2 taken 54344 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 54344 times.
✗ Branch 6 not taken.
✓ Branch 8 taken 426 times.
✓ Branch 9 taken 53918 times.
163032 if (!_helper.shift_until_any_of("<")) {
167
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 426 times.
426 if (!close_tag.empty())
168 throw IOError("Did not find closing tag: " + close_tag);
169 426 return {};
170 }
171
172
1/2
✓ Branch 1 taken 53918 times.
✗ Branch 2 not taken.
53918 const auto cur_pos = _helper.position();
173
1/2
✓ Branch 1 taken 53918 times.
✗ Branch 2 not taken.
53918 const auto chunk = _helper.read_chunk(4);
174
2/2
✓ Branch 1 taken 1720 times.
✓ Branch 2 taken 52198 times.
53918 if (chunk.starts_with("<?"))
175 1720 continue;
176
177
2/2
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 52195 times.
52198 if (chunk.starts_with("<!--")) {
178
1/2
✓ Branch 1 taken 3 times.
✗ Branch 2 not taken.
3 _helper.seek_position(cur_pos);
179
1/2
✓ Branch 1 taken 3 times.
✗ Branch 2 not taken.
3 _skip_comment();
180 3 continue;
181 }
182
183
1/2
✓ Branch 1 taken 52195 times.
✗ Branch 2 not taken.
52195 _helper.seek_position(cur_pos);
184
2/2
✓ Branch 1 taken 50262 times.
✓ Branch 2 taken 1933 times.
52195 if (!close_tag.empty()) {
185
3/4
✓ Branch 2 taken 50262 times.
✗ Branch 3 not taken.
✓ Branch 6 taken 14486 times.
✓ Branch 7 taken 35776 times.
50262 if (_helper.read_chunk(close_tag.size()) == close_tag) {
186
1/2
✓ Branch 1 taken 14486 times.
✗ Branch 2 not taken.
14486 _helper.seek_position(cur_pos);
187 14486 return {};
188 }
189
1/2
✓ Branch 1 taken 35776 times.
✗ Branch 2 not taken.
35776 _helper.seek_position(cur_pos);
190 }
191
192
2/4
✓ Branch 1 taken 37709 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 37709 times.
✗ Branch 4 not taken.
37709 if (_parse_element(parent))
193
1/2
✓ Branch 1 taken 37709 times.
✗ Branch 2 not taken.
37709 return {_helper.position()};
194
2/3
✗ Branch 1 not taken.
✓ Branch 2 taken 52195 times.
✓ Branch 3 taken 1723 times.
55641 }
195 }
196
197 // skip beyond an xml comment in the input stream
198 3 void _skip_comment() {
199 static constexpr auto comment_begin = "<!--";
200 static constexpr auto comment_end = "-->";
201 3 std::string comment_chunk;
202
203 7 const auto append_until_closing_brace = [&] () -> void {
204
3/6
✓ Branch 2 taken 7 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 7 times.
✗ Branch 6 not taken.
✓ Branch 8 taken 7 times.
✗ Branch 9 not taken.
14 comment_chunk += _helper.read_until_any_of(">");
205
1/2
✓ Branch 1 taken 7 times.
✗ Branch 2 not taken.
7 if (!_helper.is_end_of_file())
206
2/4
✓ Branch 1 taken 7 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 7 times.
✗ Branch 5 not taken.
7 comment_chunk += _helper.read_chunk(1); // read the actual ">"
207 7 };
208
209 6 const auto count = [&] (const std::string& substr) {
210 6 std::size_t count = 0;
211 6 auto found_pos = comment_chunk.find(substr);
212
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 6 times.
12 while (found_pos != std::string::npos) {
213 6 found_pos = comment_chunk.find(substr, found_pos + 1);
214 6 count++;
215 }
216 6 return count;
217 3 };
218
219
1/2
✓ Branch 1 taken 3 times.
✗ Branch 2 not taken.
3 append_until_closing_brace();
220
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3 times.
3 if (!comment_chunk.starts_with(comment_begin))
221 throw ValueError("Stream is not at a comment start position");
222
223
15/26
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 4 times.
✓ Branch 4 taken 3 times.
✗ Branch 5 not taken.
✓ Branch 8 taken 3 times.
✗ Branch 9 not taken.
✗ Branch 11 not taken.
✓ Branch 12 taken 3 times.
✓ Branch 13 taken 3 times.
✓ Branch 14 taken 4 times.
✓ Branch 16 taken 3 times.
✓ Branch 17 taken 4 times.
✓ Branch 18 taken 3 times.
✓ Branch 19 taken 4 times.
✓ Branch 21 taken 3 times.
✓ Branch 22 taken 4 times.
✓ Branch 23 taken 4 times.
✓ Branch 24 taken 3 times.
✗ Branch 25 not taken.
✗ Branch 26 not taken.
✗ Branch 28 not taken.
✗ Branch 29 not taken.
✗ Branch 30 not taken.
✗ Branch 31 not taken.
✗ Branch 33 not taken.
✗ Branch 34 not taken.
19 while (!comment_chunk.ends_with(comment_end) || count(comment_begin) != count(comment_end))
224
1/2
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
4 append_until_closing_brace();
225 3 }
226
227 // try to parse a single element; returns true on success and false otherwise
228 37709 bool _parse_element(XMLElement& parent) {
229
1/2
✓ Branch 1 taken 37709 times.
✗ Branch 2 not taken.
37709 const auto begin_pos = _helper.position();
230
2/4
✓ Branch 1 taken 37709 times.
✗ Branch 2 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 37709 times.
37709 if (!_helper.read_chunk(1).starts_with("<")) {
231 _helper.seek_position(begin_pos);
232 return false;
233 }
234
235
1/2
✓ Branch 1 taken 37709 times.
✗ Branch 2 not taken.
37709 _helper.seek_position(begin_pos);
236
1/2
✓ Branch 1 taken 37709 times.
✗ Branch 2 not taken.
37709 _helper.shift_by(1);
237
2/4
✓ Branch 2 taken 37709 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 37709 times.
✗ Branch 6 not taken.
75418 std::string name = _helper.read_until_any_of(" />");
238
1/2
✓ Branch 3 taken 37709 times.
✗ Branch 4 not taken.
37709 auto& element = parent.add_child(std::move(name));
239
240 while (true) {
241
2/4
✓ Branch 1 taken 170364 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 170364 times.
✗ Branch 5 not taken.
340728 _helper.shift_until_not_any_of(" \n");
242
1/2
✓ Branch 1 taken 170364 times.
✗ Branch 2 not taken.
170364 const auto cur_pos = _helper.position();
243
4/6
✓ Branch 1 taken 170364 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 170364 times.
✗ Branch 5 not taken.
✓ Branch 7 taken 22138 times.
✓ Branch 8 taken 148226 times.
170364 if (_helper.read_chunk(2) == "/>")
244 22138 break;
245
246
1/2
✓ Branch 1 taken 148226 times.
✗ Branch 2 not taken.
148226 _helper.seek_position(cur_pos);
247
4/6
✓ Branch 1 taken 148226 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 148226 times.
✗ Branch 5 not taken.
✓ Branch 7 taken 15571 times.
✓ Branch 8 taken 132655 times.
148226 if (_helper.read_chunk(1) == ">") {
248
1/2
✓ Branch 1 taken 15571 times.
✗ Branch 2 not taken.
15571 _parse_content(element);
249 15571 break;
250 }
251
252
1/2
✓ Branch 1 taken 132655 times.
✗ Branch 2 not taken.
132655 _helper.seek_position(cur_pos);
253
1/2
✓ Branch 1 taken 132655 times.
✗ Branch 2 not taken.
132655 auto [name, value] = _read_attribute();
254
1/2
✓ Branch 4 taken 132655 times.
✗ Branch 5 not taken.
132655 element.set_attribute(std::move(name), std::move(value));
255 132655 }
256
257 37709 return true;
258 37709 }
259
260 132655 std::pair<std::string, std::string> _read_attribute() {
261
2/4
✓ Branch 2 taken 132655 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 132655 times.
✗ Branch 6 not taken.
265310 std::string attr_name = _helper.read_until_any_of("= ");
262
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 132655 times.
132655 if (attr_name.empty())
263 throw IOError("Could not parse attribute name");
264
265
2/4
✓ Branch 2 taken 132655 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 132655 times.
✗ Branch 6 not taken.
265310 _helper.shift_until_any_of("\"");
266
1/2
✓ Branch 1 taken 132655 times.
✗ Branch 2 not taken.
132655 _helper.shift_by(1);
267
2/4
✓ Branch 2 taken 132655 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 132655 times.
✗ Branch 6 not taken.
265310 std::string attr_value = _helper.read_until_any_of("\"");
268
1/2
✓ Branch 1 taken 132655 times.
✗ Branch 2 not taken.
132655 _helper.shift_by(1);
269
270 265310 return std::make_pair(std::move(attr_name), std::move(attr_value));
271 132655 }
272
273 std::unique_ptr<std::ifstream> _owned;
274 InputStreamHelper _helper;
275 XMLElement _element;
276 ContentSkipFunction _skip_content;
277 std::unordered_map<const XMLElement*, StreamBounds> _content_bounds;
278 };
279
280 } // end namespace GridFormat
281
282 #endif // GRIDFORMAT_XML_PARSER_HPP_
283