GCC Code Coverage Report


Directory: gridformat/
File: gridformat/xml/parser.hpp
Date: 2025-03-26 17:08:15
Exec Total Coverage
Lines: 130 140 92.9%
Functions: 14 14 100.0%
Branches: 144 287 50.2%

Line Branch Exec Source
1 // SPDX-FileCopyrightText: 2022-2023 Dennis Gläser <dennis.glaeser@iws.uni-stuttgart.de>
2 // SPDX-License-Identifier: MIT
3 /*!
4 * \ingroup XML
5 * \copydoc GridFormat::XMLParser
6 */
7 #ifndef GRIDFORMAT_XML_PARSER_HPP_
8 #define GRIDFORMAT_XML_PARSER_HPP_
9
10 #include <cmath>
11 #include <string>
12 #include <memory>
13 #include <istream>
14 #include <fstream>
15 #include <type_traits>
16 #include <unordered_map>
17 #include <functional>
18 #include <optional>
19 #include <concepts>
20 #include <limits>
21
22 #include <gridformat/common/exceptions.hpp>
23 #include <gridformat/common/istream_helper.hpp>
24 #include <gridformat/xml/element.hpp>
25 #include <gridformat/xml/tag.hpp>
26
27 namespace GridFormat {
28
29
30 /*!
31 * \ingroup XML
32 * \brief Parses an XML file into an XMLElement.
33 * \note Discards any comments.
34 * \note Creates a single root element in which the parsed elements are placed.
35 * \note The XML element contents are not read. Instead, their bounds within the input stream
36 * are stored separately and the content can be retrieved via read_content_for(const XMLElement&).
37 * \note Content inside XML elements is assumed to be either before or after child elements. If multiple
38 pieces of content are intermingled with child elements, only the first piece of content will be detected.
39 * \note This implementation is not a fully-fledged XML parser, but suffices for our requirements.
40 * It is likely to fail when textual content that can be mistaken for xml is inside the elements.
41 */
42 class XMLParser {
43 public:
44 using ContentSkipFunction = std::function<bool(const XMLElement&)>;
45
46 struct StreamBounds {
47 std::streamsize begin_pos;
48 std::streamsize end_pos;
49 };
50
51 /*!
52 * \brief Parse an xml tree from the data in the file with the given name.
53 * \param filename The name of the xml file.
54 * \param root_name The name of the root element in which to place the read xml (default: "ROOT")
55 * \param skip_content_parsing A function that takes an xml element and returns true if the content
56 * of that element should not be parsed for child nodes. This is useful
57 * if the content of an element is very large and potentially invalid xml.
58 */
59 1937 explicit XMLParser(const std::string& filename,
60 const std::string& root_name = "ROOT",
61 const ContentSkipFunction& skip_content_parsing = [] (const XMLElement&) { return false; })
62 1937 : _owned{std::make_unique<std::ifstream>()}
63 3874 , _helper{*_owned}
64
1/2
✓ Branch 1 taken 1937 times.
✗ Branch 2 not taken.
1937 , _element{root_name}
65
1/2
✓ Branch 1 taken 1937 times.
✗ Branch 2 not taken.
1937 , _skip_content{skip_content_parsing} {
66
1/2
✓ Branch 2 taken 1937 times.
✗ Branch 3 not taken.
1937 _owned->open(filename);
67
4/6
✓ Branch 1 taken 3874 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 3874 times.
✗ Branch 5 not taken.
✓ Branch 8 taken 1937 times.
✓ Branch 9 taken 1937 times.
9685 while (_parse_next_element(_element)) {}
68 1937 }
69
70 //! Overload for reading from an existing stream
71 1 explicit XMLParser(std::istream& stream,
72 const std::string& root_name = "ROOT",
73 5 const ContentSkipFunction& skip_content_parsing = [] (const XMLElement&) { return false; })
74 1 : _owned{}
75 2 , _helper{stream}
76
1/2
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
1 , _element{root_name}
77
1/2
✓ Branch 1 taken 1 times.
✗ Branch 2 not taken.
1 , _skip_content{skip_content_parsing} {
78
4/6
✓ Branch 1 taken 2 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 2 times.
✗ Branch 5 not taken.
✓ Branch 8 taken 1 times.
✓ Branch 9 taken 1 times.
5 while (_parse_next_element(_element)) {}
79 1 }
80
81 //! Return a reference to the read xml representation
82 103390 const XMLElement& get_xml() const & {
83 103390 return _element;
84 }
85
86 //! Return the read xml representation as an rvalue
87 XMLElement&& get_xml() && {
88 return std::move(_element);
89 }
90
91 //! Return true if content was read for the given xml element
92 8 bool has_content(const XMLElement& e) const {
93
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
8 return _content_bounds.contains(&e);
94 }
95
96 //! Return the stream bounds for the content of the given xml element
97 10690 const StreamBounds& get_content_bounds(const XMLElement& e) const {
98
1/2
✓ Branch 1 taken 10690 times.
✗ Branch 2 not taken.
10690 return _content_bounds.at(&e);
99 }
100
101 //! Read and return the content of the given xml element
102 8 std::string read_content_for(const XMLElement& e, const std::optional<std::size_t> max_chars = {}) {
103
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
8 const auto& bounds = _content_bounds.at(&e);
104 8 const auto content_size = bounds.end_pos - bounds.begin_pos;
105 8 const auto num_chars = std::min(static_cast<std::size_t>(content_size), max_chars.value_or(content_size));
106
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
8 _helper.seek_position(bounds.begin_pos);
107
1/2
✓ Branch 1 taken 8 times.
✗ Branch 2 not taken.
16 return _helper.read_chunk(num_chars);
108 }
109
110 private:
111 // parse the content or child elements from the stream and add them to the given parent element
112 15706 void _parse_content(XMLElement& parent) {
113
1/2
✓ Branch 2 taken 15706 times.
✗ Branch 3 not taken.
15706 const std::string close_tag = "</" + parent.name();
114
1/2
✓ Branch 1 taken 15706 times.
✗ Branch 2 not taken.
15706 auto content_begin_pos = _helper.position();
115 15706 auto content_end_pos = content_begin_pos;
116
117
3/4
✓ Branch 1 taken 15706 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 1086 times.
✓ Branch 4 taken 14620 times.
15706 if (_skip_content(parent)) {
118
2/4
✓ Branch 1 taken 1086 times.
✗ Branch 2 not taken.
✗ Branch 3 not taken.
✓ Branch 4 taken 1086 times.
1086 if (!_helper.shift_until_substr(close_tag))
119 throw IOError("Could not find closing tag: " + close_tag);
120
1/2
✓ Branch 1 taken 1086 times.
✗ Branch 2 not taken.
1086 content_end_pos = _helper.position();
121
1/2
✓ Branch 2 taken 1086 times.
✗ Branch 3 not taken.
1086 _helper.shift_by(close_tag.size());
122 } else {
123 // check for content before the first child
124
3/6
✓ Branch 2 taken 14620 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 14620 times.
✗ Branch 6 not taken.
✗ Branch 8 not taken.
✓ Branch 9 taken 14620 times.
43860 if (!_helper.shift_until_any_of("<"))
125 throw IOError("Could not find closing tag for '" + parent.name() + "'");
126
1/2
✓ Branch 1 taken 14620 times.
✗ Branch 2 not taken.
14620 content_end_pos = _helper.position();
127
1/2
✓ Branch 1 taken 14620 times.
✗ Branch 2 not taken.
14620 _helper.seek_position(content_begin_pos);
128
1/2
✓ Branch 1 taken 14620 times.
✗ Branch 2 not taken.
14620 _helper.shift_whitespace();
129
1/2
✓ Branch 1 taken 14620 times.
✗ Branch 2 not taken.
14620 const bool have_read_content = _helper.position() < content_end_pos;
130
1/2
✓ Branch 1 taken 14620 times.
✗ Branch 2 not taken.
14620 _helper.seek_position(content_end_pos);
131
132 // parse all children
133 14620 std::optional<std::streamsize> position_after_last_child;
134 while (true) {
135
1/2
✓ Branch 1 taken 50533 times.
✗ Branch 2 not taken.
50533 auto pos = _parse_next_element(parent, close_tag);
136
2/2
✓ Branch 1 taken 35913 times.
✓ Branch 2 taken 14620 times.
50533 if (pos) position_after_last_child = pos;
137 14620 else break;
138 35913 }
139
140 // (maybe) check for content after the children
141
6/6
✓ Branch 0 taken 12923 times.
✓ Branch 1 taken 1697 times.
✓ Branch 3 taken 12403 times.
✓ Branch 4 taken 520 times.
✓ Branch 5 taken 12403 times.
✓ Branch 6 taken 2217 times.
14620 if (!have_read_content && position_after_last_child.has_value()) {
142
1/2
✓ Branch 1 taken 12403 times.
✗ Branch 2 not taken.
12403 content_begin_pos = position_after_last_child.value();
143
1/2
✓ Branch 1 taken 12403 times.
✗ Branch 2 not taken.
12403 content_end_pos = _helper.position();
144 }
145
146
2/4
✓ Branch 2 taken 14620 times.
✗ Branch 3 not taken.
✗ Branch 6 not taken.
✓ Branch 7 taken 14620 times.
14620 if (_helper.read_chunk(close_tag.size()) != close_tag)
147 throw IOError("Could not find closing tag for '" + parent.name() + "'");
148
3/6
✓ Branch 2 taken 14620 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 14620 times.
✗ Branch 6 not taken.
✗ Branch 8 not taken.
✓ Branch 9 taken 14620 times.
43860 if (!_helper.shift_until_any_of(">"))
149 throw IOError("Could not find closing tag for '" + parent.name() + "'");
150
2/4
✓ Branch 1 taken 14620 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 14620 times.
✗ Branch 4 not taken.
14620 if (!_helper.is_end_of_file())
151
1/2
✓ Branch 1 taken 14620 times.
✗ Branch 2 not taken.
14620 _helper.shift_by(1);
152 }
153
154
1/2
✓ Branch 1 taken 15706 times.
✗ Branch 2 not taken.
15706 _content_bounds[&parent] = StreamBounds{.begin_pos = content_begin_pos, .end_pos = content_end_pos};
155 15706 }
156
157 // parse the next child element from the stream and return the position after it (or none if no child found)
158 54409 std::optional<std::streamsize> _parse_next_element(XMLElement& parent, const std::string& close_tag = "") {
159 while (true) {
160
3/4
✓ Branch 1 taken 56137 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 1512 times.
✓ Branch 4 taken 54625 times.
56137 if (_helper.is_end_of_file()) {
161
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 1512 times.
1512 if (!close_tag.empty())
162 throw IOError("Did not find closing tag: " + close_tag);
163 1512 return {};
164 }
165
166
4/6
✓ Branch 2 taken 54625 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 54625 times.
✗ Branch 6 not taken.
✓ Branch 8 taken 426 times.
✓ Branch 9 taken 54199 times.
163875 if (!_helper.shift_until_any_of("<")) {
167
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 426 times.
426 if (!close_tag.empty())
168 throw IOError("Did not find closing tag: " + close_tag);
169 426 return {};
170 }
171
172
1/2
✓ Branch 1 taken 54199 times.
✗ Branch 2 not taken.
54199 const auto cur_pos = _helper.position();
173
1/2
✓ Branch 1 taken 54199 times.
✗ Branch 2 not taken.
54199 const auto chunk = _helper.read_chunk(4);
174
2/2
✓ Branch 1 taken 1725 times.
✓ Branch 2 taken 52474 times.
54199 if (chunk.starts_with("<?"))
175 1725 continue;
176
177
2/2
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 52471 times.
52474 if (chunk.starts_with("<!--")) {
178
1/2
✓ Branch 1 taken 3 times.
✗ Branch 2 not taken.
3 _helper.seek_position(cur_pos);
179
1/2
✓ Branch 1 taken 3 times.
✗ Branch 2 not taken.
3 _skip_comment();
180 3 continue;
181 }
182
183
1/2
✓ Branch 1 taken 52471 times.
✗ Branch 2 not taken.
52471 _helper.seek_position(cur_pos);
184
2/2
✓ Branch 1 taken 50533 times.
✓ Branch 2 taken 1938 times.
52471 if (!close_tag.empty()) {
185
3/4
✓ Branch 2 taken 50533 times.
✗ Branch 3 not taken.
✓ Branch 6 taken 14620 times.
✓ Branch 7 taken 35913 times.
50533 if (_helper.read_chunk(close_tag.size()) == close_tag) {
186
1/2
✓ Branch 1 taken 14620 times.
✗ Branch 2 not taken.
14620 _helper.seek_position(cur_pos);
187 14620 return {};
188 }
189
1/2
✓ Branch 1 taken 35913 times.
✗ Branch 2 not taken.
35913 _helper.seek_position(cur_pos);
190 }
191
192
2/4
✓ Branch 1 taken 37851 times.
✗ Branch 2 not taken.
✓ Branch 3 taken 37851 times.
✗ Branch 4 not taken.
37851 if (_parse_element(parent))
193
1/2
✓ Branch 1 taken 37851 times.
✗ Branch 2 not taken.
37851 return {_helper.position()};
194
2/3
✗ Branch 1 not taken.
✓ Branch 2 taken 52471 times.
✓ Branch 3 taken 1728 times.
55927 }
195 }
196
197 // skip beyond an xml comment in the input stream
198 3 void _skip_comment() {
199 static constexpr auto comment_begin = "<!--";
200 static constexpr auto comment_end = "-->";
201 3 std::string comment_chunk;
202
203 7 const auto append_until_closing_brace = [&] () -> void {
204
3/6
✓ Branch 2 taken 7 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 7 times.
✗ Branch 6 not taken.
✓ Branch 8 taken 7 times.
✗ Branch 9 not taken.
14 comment_chunk += _helper.read_until_any_of(">");
205
1/2
✓ Branch 1 taken 7 times.
✗ Branch 2 not taken.
7 if (!_helper.is_end_of_file())
206
2/4
✓ Branch 1 taken 7 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 7 times.
✗ Branch 5 not taken.
7 comment_chunk += _helper.read_chunk(1); // read the actual ">"
207 7 };
208
209 6 const auto count = [&] (const std::string& substr) {
210 6 std::size_t count = 0;
211 6 auto found_pos = comment_chunk.find(substr);
212
2/2
✓ Branch 0 taken 6 times.
✓ Branch 1 taken 6 times.
12 while (found_pos != std::string::npos) {
213 6 found_pos = comment_chunk.find(substr, found_pos + 1);
214 6 count++;
215 }
216 6 return count;
217 3 };
218
219
1/2
✓ Branch 1 taken 3 times.
✗ Branch 2 not taken.
3 append_until_closing_brace();
220
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 3 times.
3 if (!comment_chunk.starts_with(comment_begin))
221 throw ValueError("Stream is not at a comment start position");
222
223
15/26
✓ Branch 1 taken 3 times.
✓ Branch 2 taken 4 times.
✓ Branch 4 taken 3 times.
✗ Branch 5 not taken.
✓ Branch 8 taken 3 times.
✗ Branch 9 not taken.
✗ Branch 11 not taken.
✓ Branch 12 taken 3 times.
✓ Branch 13 taken 3 times.
✓ Branch 14 taken 4 times.
✓ Branch 16 taken 3 times.
✓ Branch 17 taken 4 times.
✓ Branch 18 taken 3 times.
✓ Branch 19 taken 4 times.
✓ Branch 21 taken 3 times.
✓ Branch 22 taken 4 times.
✓ Branch 23 taken 4 times.
✓ Branch 24 taken 3 times.
✗ Branch 25 not taken.
✗ Branch 26 not taken.
✗ Branch 28 not taken.
✗ Branch 29 not taken.
✗ Branch 30 not taken.
✗ Branch 31 not taken.
✗ Branch 33 not taken.
✗ Branch 34 not taken.
19 while (!comment_chunk.ends_with(comment_end) || count(comment_begin) != count(comment_end))
224
1/2
✓ Branch 1 taken 4 times.
✗ Branch 2 not taken.
4 append_until_closing_brace();
225 3 }
226
227 // try to parse a single element; returns true on success and false otherwise
228 37851 bool _parse_element(XMLElement& parent) {
229
1/2
✓ Branch 1 taken 37851 times.
✗ Branch 2 not taken.
37851 const auto begin_pos = _helper.position();
230
2/4
✓ Branch 1 taken 37851 times.
✗ Branch 2 not taken.
✗ Branch 5 not taken.
✓ Branch 6 taken 37851 times.
37851 if (!_helper.read_chunk(1).starts_with("<")) {
231 _helper.seek_position(begin_pos);
232 return false;
233 }
234
235
1/2
✓ Branch 1 taken 37851 times.
✗ Branch 2 not taken.
37851 _helper.seek_position(begin_pos);
236
1/2
✓ Branch 1 taken 37851 times.
✗ Branch 2 not taken.
37851 _helper.shift_by(1);
237
2/4
✓ Branch 2 taken 37851 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 37851 times.
✗ Branch 6 not taken.
75702 std::string name = _helper.read_until_any_of(" />");
238
1/2
✓ Branch 3 taken 37851 times.
✗ Branch 4 not taken.
37851 auto& element = parent.add_child(std::move(name));
239
240 while (true) {
241
2/4
✓ Branch 1 taken 170961 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 170961 times.
✗ Branch 5 not taken.
341922 _helper.shift_until_not_any_of(" \n");
242
1/2
✓ Branch 1 taken 170961 times.
✗ Branch 2 not taken.
170961 const auto cur_pos = _helper.position();
243
4/6
✓ Branch 1 taken 170961 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 170961 times.
✗ Branch 5 not taken.
✓ Branch 7 taken 22145 times.
✓ Branch 8 taken 148816 times.
170961 if (_helper.read_chunk(2) == "/>")
244 22145 break;
245
246
1/2
✓ Branch 1 taken 148816 times.
✗ Branch 2 not taken.
148816 _helper.seek_position(cur_pos);
247
4/6
✓ Branch 1 taken 148816 times.
✗ Branch 2 not taken.
✓ Branch 4 taken 148816 times.
✗ Branch 5 not taken.
✓ Branch 7 taken 15706 times.
✓ Branch 8 taken 133110 times.
148816 if (_helper.read_chunk(1) == ">") {
248
1/2
✓ Branch 1 taken 15706 times.
✗ Branch 2 not taken.
15706 _parse_content(element);
249 15706 break;
250 }
251
252
1/2
✓ Branch 1 taken 133110 times.
✗ Branch 2 not taken.
133110 _helper.seek_position(cur_pos);
253
1/2
✓ Branch 1 taken 133110 times.
✗ Branch 2 not taken.
133110 auto [name, value] = _read_attribute();
254
1/2
✓ Branch 4 taken 133110 times.
✗ Branch 5 not taken.
133110 element.set_attribute(std::move(name), std::move(value));
255 133110 }
256
257 37851 return true;
258 37851 }
259
260 133110 std::pair<std::string, std::string> _read_attribute() {
261
2/4
✓ Branch 2 taken 133110 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 133110 times.
✗ Branch 6 not taken.
266220 std::string attr_name = _helper.read_until_any_of("= ");
262
1/2
✗ Branch 1 not taken.
✓ Branch 2 taken 133110 times.
133110 if (attr_name.empty())
263 throw IOError("Could not parse attribute name");
264
265
2/4
✓ Branch 2 taken 133110 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 133110 times.
✗ Branch 6 not taken.
266220 _helper.shift_until_any_of("\"");
266
1/2
✓ Branch 1 taken 133110 times.
✗ Branch 2 not taken.
133110 _helper.shift_by(1);
267
2/4
✓ Branch 2 taken 133110 times.
✗ Branch 3 not taken.
✓ Branch 5 taken 133110 times.
✗ Branch 6 not taken.
266220 std::string attr_value = _helper.read_until_any_of("\"");
268
1/2
✓ Branch 1 taken 133110 times.
✗ Branch 2 not taken.
133110 _helper.shift_by(1);
269
270 266220 return std::make_pair(std::move(attr_name), std::move(attr_value));
271 133110 }
272
273 std::unique_ptr<std::ifstream> _owned;
274 InputStreamHelper _helper;
275 XMLElement _element;
276 ContentSkipFunction _skip_content;
277 std::unordered_map<const XMLElement*, StreamBounds> _content_bounds;
278 };
279
280 } // end namespace GridFormat
281
282 #endif // GRIDFORMAT_XML_PARSER_HPP_
283