Kea  1.9.9-git
strutil.cc
Go to the documentation of this file.
1 // Copyright (C) 2011-2020 Internet Systems Consortium, Inc. ("ISC")
2 //
3 // This Source Code Form is subject to the terms of the Mozilla Public
4 // License, v. 2.0. If a copy of the MPL was not distributed with this
5 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
6 
7 #include <config.h>
8 
9 #include <util/encode/hex.h>
10 #include <util/strutil.h>
11 
12 #include <boost/algorithm/string/classification.hpp>
13 #include <boost/algorithm/string/constants.hpp>
14 #include <boost/algorithm/string/split.hpp>
15 
16 #include <numeric>
17 #include <iostream>
18 #include <sstream>
19 
20 // Early versions of C++11 regex were buggy; use it if we can,
21 // otherwise we fall back to regcomp/regexec. For more info see:
22 // https://stackoverflow.com/questions/12530406/is-gcc-4-8-or-earlier-buggy-about-regular-expressions
23 #ifdef USE_REGEX
24 #include <regex>
25 #else
26 #include <sys/types.h>
27 #include <regex.h>
28 #endif
29 
30 #include <string.h>
31 
32 using namespace std;
33 
34 namespace isc {
35 namespace util {
36 namespace str {
37 
38 // Normalize slashes
39 
// Rewrites the string in place so that every backslash becomes a forward
// slash. Strings without backslashes (including the empty string) are left
// untouched.
void
normalizeSlash(std::string& name) {
    for (std::string::size_type at = name.find('\\');
         at != std::string::npos;
         at = name.find('\\', at)) {
        // The character at 'at' is a backslash: overwrite it. The next
        // search restarts here and moves on since it is now a '/'.
        name[at] = '/';
    }
}
49 
50 // Trim String
51 
// Returns a copy of the input with leading and trailing blanks removed.
// Only space, horizontal tab and newline count as blanks. An empty or
// all-blank input yields an empty string.
std::string
trim(const std::string& instring) {
    static const char* const whitespace = " \t\n";

    // Locate the first character that must be kept.
    const std::string::size_type head = instring.find_first_not_of(whitespace);
    if (head == std::string::npos) {
        // Nothing but blanks (or nothing at all).
        return (std::string());
    }

    // At least one non-blank exists, so this search cannot fail.
    const std::string::size_type tail = instring.find_last_not_of(whitespace);

    return (instring.substr(head, tail - head + 1));
}
72 
73 // Tokenize string. As noted in the header, this is locally written to avoid
74 // another dependency on a Boost library.
75 
// Splits 'text' into tokens separated by any character found in 'delim'.
// Runs of delimiters are collapsed, so no empty token is ever returned.
//
// When 'escape' is true a backslash changes the meaning of the character
// that follows it:
//  - backslash + delimiter : the delimiter is kept as part of the token;
//  - backslash + backslash : a single backslash is kept;
//  - backslash + other     : both the backslash and the character are kept;
//  - trailing backslash    : the backslash is kept.
// When 'escape' is false the backslash is an ordinary character.
std::vector<std::string>
tokens(const std::string& text, const std::string& delim, bool escape) {
    std::vector<std::string> pieces;
    std::string current;
    bool building = false;        // true once a token has been started
    bool pending_escape = false;  // true right after an unconsumed backslash

    for (const char ch : text) {
        const bool is_delim = (delim.find(ch) != std::string::npos);

        if (is_delim) {
            if (!building) {
                // Leading or repeated delimiter: nothing to do.
                continue;
            }
            if (pending_escape) {
                // The delimiter was escaped, so it belongs to the token.
                pending_escape = false;
                current.push_back(ch);
                continue;
            }
            // Genuine end of token: store it and start afresh.
            if (!current.empty()) {
                pieces.push_back(current);
            }
            building = false;
            current.clear();
            continue;
        }

        // Any non-delimiter character starts (or continues) a token.
        building = true;

        if (escape && (ch == '\\')) {
            if (pending_escape) {
                // Escaped backslash: keep exactly one of the pair.
                current.push_back(ch);
            }
            pending_escape = !pending_escape;
            continue;
        }

        if (pending_escape) {
            // The backslash preceded an ordinary character, so it is not
            // an escape after all: restore it in front of the character.
            pending_escape = false;
            current.push_back('\\');
        }
        current.push_back(ch);
    }

    // Input exhausted: flush a dangling backslash and the last token.
    if (pending_escape) {
        current.push_back('\\');
    }
    if (!current.empty()) {
        pieces.push_back(current);
    }

    return (pieces);
}
142 
143 // Local function to pass to accumulate() for summing up string lengths.
144 
namespace {

// Binary operation for std::accumulate(): adds the length of 'cur_string'
// to the running total 'curlen'.
size_t
lengthSum(std::string::size_type curlen, const std::string& cur_string) {
    return (curlen + cur_string.size());
}

}

// Provide printf-style formatting.
//
// Replaces successive "%s" placeholders in 'format' with the elements of
// 'args', in order. Substitution stops when either the placeholders or the
// arguments run out: surplus placeholders are left as they are and surplus
// arguments are ignored. Text substituted from an argument is never scanned
// again, so an argument that itself contains "%s" is inserted verbatim.

std::string
format(const std::string& format, const std::vector<std::string>& args) {

    static const std::string flag = "%s";

    // Estimate the final size so that the result can be built with a single
    // allocation: the format string plus all arguments, less the space taken
    // by the placeholders being replaced. The subtraction is done only when
    // it cannot wrap around (there may be more arguments than placeholders),
    // because a wrapped value would make reserve() throw.
    const size_t gross = std::accumulate(args.begin(), args.end(),
                                         format.size(), lengthSum);
    const size_t placeholders = args.size() * flag.size();
    const size_t estimate =
        (gross > placeholders) ? (gross - placeholders) : format.size();

    std::string result;
    result.reserve(estimate);
    result.append(format);

    // Walk through the placeholders, replacing one per argument.
    std::string::size_type tokenpos = 0;  // Where the next search starts
    for (std::vector<std::string>::size_type i = 0; i < args.size(); ++i) {
        tokenpos = result.find(flag, tokenpos);
        if (tokenpos == std::string::npos) {
            // No placeholder left: remaining arguments are unused.
            break;
        }
        result.replace(tokenpos, flag.size(), args[i]);

        // Resume after the inserted text so it is not mistaken for a
        // placeholder.
        tokenpos += args[i].size();
    }

    return (result);
}
184 
185 std::string
186 getToken(std::istringstream& iss) {
187  string token;
188  iss >> token;
189  if (iss.bad() || iss.fail()) {
190  isc_throw(StringTokenError, "could not read token from string");
191  }
192  return (token);
193 }
194 
195 std::vector<uint8_t>
196 quotedStringToBinary(const std::string& quoted_string) {
197  std::vector<uint8_t> binary;
198  // Remove whitespace before and after the quotes.
199  std::string trimmed_string = trim(quoted_string);
200 
201  // We require two quote characters, so the length of the string must be
202  // equal to 2 at minimum, and it must start and end with quotes.
203  if ((trimmed_string.length() > 1) && ((trimmed_string[0] == '\'') &&
204  (trimmed_string[trimmed_string.length()-1] == '\''))) {
205  // Remove quotes and trim the text inside the quotes.
206  trimmed_string = trim(trimmed_string.substr(1, trimmed_string.length() - 2));
207  // Copy string contents into the vector.
208  binary.assign(trimmed_string.begin(), trimmed_string.end());
209  }
210  // Return resulting vector or empty vector.
211  return (binary);
212 }
213 
214 void
215 decodeColonSeparatedHexString(const std::string& hex_string,
216  std::vector<uint8_t>& binary) {
217  decodeSeparatedHexString(hex_string, ":", binary);
218 }
219 
220 void
221 decodeSeparatedHexString(const std::string& hex_string, const std::string& sep,
222  std::vector<uint8_t>& binary) {
223  std::vector<std::string> split_text;
224  boost::split(split_text, hex_string, boost::is_any_of(sep),
225  boost::algorithm::token_compress_off);
226 
227  std::vector<uint8_t> binary_vec;
228  for (size_t i = 0; i < split_text.size(); ++i) {
229 
230  // If there are multiple tokens and the current one is empty, it
231  // means that two consecutive colons were specified. This is not
232  // allowed.
233  if ((split_text.size() > 1) && split_text[i].empty()) {
234  isc_throw(isc::BadValue, "two consecutive separators ('" << sep << "') specified in"
235  " a decoded string '" << hex_string << "'");
236 
237  // Between a colon we expect at most two characters.
238  } else if (split_text[i].size() > 2) {
239  isc_throw(isc::BadValue, "invalid format of the decoded string"
240  << " '" << hex_string << "'");
241 
242  } else if (!split_text[i].empty()) {
243  std::stringstream s;
244  s << "0x";
245 
246  for (unsigned int j = 0; j < split_text[i].length(); ++j) {
247  // Check if we're dealing with hexadecimal digit.
248  if (!isxdigit(split_text[i][j])) {
249  isc_throw(isc::BadValue, "'" << split_text[i][j]
250  << "' is not a valid hexadecimal digit in"
251  << " decoded string '" << hex_string << "'");
252  }
253  s << split_text[i][j];
254  }
255 
256  // The stream should now have one or two hexadecimal digits.
257  // Let's convert it to a number and store in a temporary
258  // vector.
259  unsigned int binary_value;
260  s >> std::hex >> binary_value;
261 
262  binary_vec.push_back(static_cast<uint8_t>(binary_value));
263  }
264 
265  }
266 
267  // All ok, replace the data in the output vector with a result.
268  binary.swap(binary_vec);
269 }
270 
271 
272 void
273 decodeFormattedHexString(const std::string& hex_string,
274  std::vector<uint8_t>& binary) {
275  // If there is at least one colon we assume that the string
276  // comprises octets separated by colons (e.g. MAC address notation).
277  if (hex_string.find(':') != std::string::npos) {
278  decodeSeparatedHexString(hex_string, ":", binary);
279  } else if (hex_string.find(' ') != std::string::npos) {
280  decodeSeparatedHexString(hex_string, " ", binary);
281  } else {
282  std::ostringstream s;
283 
284  // If we have odd number of digits we'll have to prepend '0'.
285  if (hex_string.length() % 2 != 0) {
286  s << "0";
287  }
288 
289  // It is ok to use '0x' prefix in a string.
290  if ((hex_string.length() > 2) && (hex_string.substr(0, 2) == "0x")) {
291  // Exclude '0x' from the decoded string.
292  s << hex_string.substr(2);
293 
294  } else {
295  // No '0x', so decode the whole string.
296  s << hex_string;
297  }
298 
299  try {
300  // Decode the hex string.
301  encode::decodeHex(s.str(), binary);
302 
303  } catch (...) {
304  isc_throw(isc::BadValue, "'" << hex_string << "' is not a valid"
305  " string of hexadecimal digits");
306  }
307  }
308 }
309 
311 public:
312  StringSanitizerImpl(const std::string& char_set, const std::string& char_replacement)
313  : char_set_(char_set), char_replacement_(char_replacement) {
314  if (char_set.size() > StringSanitizer::MAX_DATA_SIZE) {
315  isc_throw(isc::BadValue, "char set size: '" << char_set.size()
316  << "' exceeds max size: '"
317  << StringSanitizer::MAX_DATA_SIZE << "'");
318  }
319 
320  if (char_replacement.size() > StringSanitizer::MAX_DATA_SIZE) {
321  isc_throw(isc::BadValue, "char replacement size: '"
322  << char_replacement.size() << "' exceeds max size: '"
323  << StringSanitizer::MAX_DATA_SIZE << "'");
324  }
325 #ifdef USE_REGEX
326  try {
327  scrub_exp_ = std::regex(char_set, std::regex::extended);
328  } catch (const std::exception& ex) {
329  isc_throw(isc::BadValue, "invalid regex: '"
330  << char_set_ << "', " << ex.what());
331  }
332 #else
333  int ec = regcomp(&scrub_exp_, char_set_.c_str(), REG_EXTENDED);
334  if (ec) {
335  char errbuf[512] = "";
336  static_cast<void>(regerror(ec, &scrub_exp_, errbuf, sizeof(errbuf)));
337  regfree(&scrub_exp_);
338  isc_throw(isc::BadValue, "invalid regex: '" << char_set_ << "', " << errbuf);
339  }
340 #endif
341  }
342 
345 #ifndef USE_REGEX
346  regfree(&scrub_exp_);
347 #endif
348  }
349 
350  std::string scrub(const std::string& original) {
351 #ifdef USE_REGEX
352  std::stringstream result;
353  try {
354  std::regex_replace(std::ostream_iterator<char>(result),
355  original.begin(), original.end(),
356  scrub_exp_, char_replacement_);
357  } catch (const std::exception& ex) {
358  isc_throw(isc::BadValue, "replacing '" << char_set_ << "' with '"
359  << char_replacement_ << "' in '" << original << "' failed: ,"
360  << ex.what());
361  }
362 
363  return (result.str());
364 #else
365  // In order to handle embedded nuls, we have to process in nul-terminated
366  // chunks. We iterate over the original data, doing pattern replacement
367  // on each chunk.
368  const char* orig_data = original.data();
369  const char* dead_end = orig_data + original.size();
370  const char* start_from = orig_data;
371  stringstream result;
372 
373  while (start_from < dead_end) {
374  // Iterate over original string, match by match.
375  regmatch_t matches[2]; // n matches + 1
376  const char* end_at = start_from + strlen(start_from);
377 
378  while (start_from < end_at) {
379  // Look for the next match
380  if (regexec(&scrub_exp_, start_from, 1, matches, 0) == REG_NOMATCH) {
381  // No matches, so add in the remainder
382  result << start_from;
383  start_from = end_at + 1;
384  break;
385  }
386 
387  // Shouldn't happen, but one never knows eh?
388  if (matches[0].rm_so == -1) {
389  isc_throw(isc::Unexpected, "matched but so is -1?");
390  }
391 
392  // Add everything from starting point up to the current match
393  const char* match_at = start_from + matches[0].rm_so;
394  while (start_from < match_at) {
395  result << *start_from;
396  ++start_from;
397  }
398 
399  // Add in the replacement
400  result << char_replacement_;
401 
402  // Move past the match.
403  ++start_from;
404  }
405 
406  // if we have an embedded nul, replace it and continue
407  if (start_from < dead_end) {
408  // Add in the replacement
409  result << char_replacement_;
410  start_from = end_at + 1;
411  }
412  }
413 
414  return (result.str());
415 #endif
416  }
417 
418 private:
420  std::string char_set_;
421 
423  std::string char_replacement_;
424 
425 #ifdef USE_REGEX
426  regex scrub_exp_;
427 #else
428  regex_t scrub_exp_;
429 #endif
430 };
431 
432 // @note The regex engine is implemented using recursion and can cause
433 // stack overflow if the input data is too large. An arbitrary size of
434 // 4096 should be enough for all cases.
435 const uint32_t StringSanitizer::MAX_DATA_SIZE = 4096;
436 
437 StringSanitizer::StringSanitizer(const std::string& char_set,
438  const std::string& char_replacement)
439  : impl_(new StringSanitizerImpl(char_set, char_replacement)) {
440 }
441 
443  delete impl_;
444 }
445 
446 std::string
447 StringSanitizer::scrub(const std::string& original) {
448  return (impl_->scrub(original));
449 }
450 
451 } // namespace str
452 } // namespace util
453 } // namespace isc
~StringSanitizer()
Destructor.
Definition: strutil.cc:442
STL namespace.
void decodeSeparatedHexString(const std::string &hex_string, const std::string &sep, std::vector< uint8_t > &binary)
Converts a string of separated hexadecimal digits into a vector.
Definition: strutil.cc:221
void decodeFormattedHexString(const std::string &hex_string, std::vector< uint8_t > &binary)
Converts a formatted string of hexadecimal digits into a vector.
Definition: strutil.cc:273
#define isc_throw(type, stream)
A shortcut macro to insert known values into exception arguments.
A generic exception that is thrown if a parameter given to a method is considered invalid in that con...
std::string scrub(const std::string &original)
Definition: strutil.cc:350
void decodeHex(const string &input, vector< uint8_t > &result)
Decode a text encoded in the base16 ('hex') format into the original data.
Definition: base_n.cc:474
A generic exception that is thrown when an unexpected error condition occurs.
std::string getToken(std::istringstream &iss)
Returns one token from the given stringstream.
Definition: strutil.cc:186
void normalizeSlash(std::string &name)
Normalize Backslash.
Definition: strutil.cc:41
vector< string > tokens(const std::string &text, const std::string &delim, bool escape)
Split String into Tokens.
Definition: strutil.cc:77
virtual const char * what() const
Returns a C-style character string of the cause of the exception.
A Set of C++ Utilities for Manipulating Strings.
Definition: strutil.h:30
std::vector< uint8_t > quotedStringToBinary(const std::string &quoted_string)
Converts a string in quotes into vector.
Definition: strutil.cc:196
Defines the logger used by the top-level component of kea-dhcp-ddns.
void decodeColonSeparatedHexString(const std::string &hex_string, std::vector< uint8_t > &binary)
Converts a string of hexadecimal digits with colons into a vector.
Definition: strutil.cc:215
std::string scrub(const std::string &original)
Returns a scrubbed copy of a given string.
Definition: strutil.cc:447
string trim(const string &instring)
Trim Leading and Trailing Spaces.
Definition: strutil.cc:53
StringSanitizerImpl(const std::string &char_set, const std::string &char_replacement)
Definition: strutil.cc:312
std::string format(const std::string &format, const std::vector< std::string > &args)
Apply Formatting.
Definition: strutil.cc:157