From c0beeb5f0b3285fd7b411859bd68d44b472ad034 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Thu, 14 Apr 2016 16:20:59 +0300 Subject: Add timestamp from_string() --- NEWS | 1 + butl/timestamp | 64 ++++++++-- butl/timestamp.cxx | 291 ++++++++++++++++++++++++++++++++++++++++++++- tests/buildfile | 2 +- tests/timestamp/buildfile | 7 ++ tests/timestamp/driver.cxx | 159 +++++++++++++++++++++++++ 6 files changed, 513 insertions(+), 11 deletions(-) create mode 100644 tests/timestamp/buildfile create mode 100644 tests/timestamp/driver.cxx diff --git a/NEWS b/NEWS index 60026c3..2c5645d 100644 --- a/NEWS +++ b/NEWS @@ -13,6 +13,7 @@ Version 0.3.0 * Derive a target class from the target triplet for some targets. Currently the classes are: 'linux', 'macosx', 'freebsd', 'windows', and 'other'. + * Implement timestamp from_string(). Version 0.2.0 diff --git a/butl/timestamp b/butl/timestamp index 10090c5..90bca08 100644 --- a/butl/timestamp +++ b/butl/timestamp @@ -38,19 +38,22 @@ namespace butl // Generally-useful special values. // - const timestamp timestamp_unknown {duration {-1}}; - const timestamp timestamp_nonexistent {duration {0}}; + const timestamp timestamp_unknown = timestamp (duration (-1)); + const timestamp timestamp_nonexistent = timestamp (duration (0)); - // Human-readable representation. By default the timestamp is printed by - // localtime_r() in the local timezone, so tzset() from <time.h> should be - // called prior to using the corresponding operator or the to_stream() - // function (normally from main() or equivalent). + // Print human-readable representation of the timestamp. + // + // By default the timestamp is printed by localtime_r() in the local + // timezone, so tzset() from <time.h> should be called prior to using the + // corresponding operator or the to_stream() function (normally from main() + // or equivalent).
// // The format argument in the to_stream() function is the put_time() format // string except that it also supports the nanoseconds conversion specifier // in the form %[<d>N] where <d> is the optional single delimiter character, - // for example '.'. If the nanoseconds part is 0, then it is not printed (nor - // the delimiter character). + // for example '.'. If the nanoseconds part is 0, then it is not printed + // (nor the delimiter character). Otherwise, if necessary, the nanoseconds + // part is padded to 9 characters with leading zeros. // // The special argument in the to_stream() function indicates whether the // special timestamp_unknown and timestamp_nonexistent values should be @@ -87,6 +90,51 @@ namespace butl std::ostream& operator<< (std::ostream&, const duration&); + + // Parse human-readable representation of the timestamp. + // + // The format argument is the strptime() format string except that it also + // supports the fraction of a second specifier in the form %[<d><f>], where + // <d> is the optional single delimiter character, for example '.', and <f> + // is one of the 'N', 'U', 'M' characters, denoting nanoseconds, + // microseconds and milliseconds, respectively. + // + // The delimiter is mandatory in the format string. If no such character is + // encountered at the corresponding position of the input string, the + // function behaves as if no %[] specifier were provided. Only a single %[] + // specifier in the format string is currently supported. + // + // If the delimiter is present, then it should be followed by 9 (N), 6 (U), + // or 3 (M) digit value padded with leading zeros if necessary. + // + // If the local argument is true, then the input is assumed to be local time + // and the result is returned as local time as well. Otherwise, UTC is used + // in both cases. + // + // If the end argument is not NULL, then it points to the first character + // that was not parsed. Otherwise, throw std::system_error in case of any + // unparsed characters.
+ // + // Throw std::system_error on input/format mismatch and underlying time + // conversion function failures. + // + // Note that internally from_string() calls strptime(), which behaves + // according to the process' C locale (set with std::setlocale()) and not + // the C++ locale (set with std::locale::global()). Meanwhile the behaviour + // can be affected by std::locale::global() as well, as it itself calls + // std::setlocale() for the locale with a name. + // + // Potential improvements: + // - support %() version for non-optional component but with optional + // delimiter + // - ability to parse local, return UTC and vice-versa + // - handle timezone parsing + // + timestamp + from_string (const char* input, + const char* format, + bool local, + const char** end = nullptr); }; #endif // BUTL_TIMESTAMP diff --git a/butl/timestamp.cxx b/butl/timestamp.cxx index 6299ba3..f3966b7 100644 --- a/butl/timestamp.cxx +++ b/butl/timestamp.cxx @@ -4,13 +4,16 @@ #include -#include // localtime_r(), gmtime_r() +#include // localtime_r(), gmtime_r(), strptime(), timegm() #include // EINVAL -#include // tm, strftime() +#include // tm, time_t, strftime(), mktime() +#include // strtoull() +#include #include // put_time(), setw(), dec, right #include // strlen(), memcpy() #include +#include // pair, make_pair() #include // runtime_error #include @@ -28,6 +31,9 @@ using namespace std; // of the std::tm argument. // #ifdef __GLIBCXX__ +#include // tm, strftime() +#include + namespace details { struct put_time_data @@ -253,3 +259,284 @@ namespace butl return os; } } + +// VC++ implementation of strptime() via std::get_time(). +// +// To debug fallback functions with GCC, uncomment the following defines. 
+// +//#define _MSC_VER +//#define strptime strptime_ +//#define timegm timegm_ + +#ifdef _MSC_VER +#include // time_t, tm, mktime(), gmtime() +#include +#include +#include +#include +#include // strlen() + +namespace details +{ + static char* + strptime (const char* input, const char* format, tm* time) + { + istringstream is (input); + + // The original strptime() function behaves according to the process' C + // locale (set with std::setlocale()), which can differ from the process + // C++ locale (set with std::locale::global()). + // + is.imbue (locale (setlocale (LC_ALL, nullptr))); + + if (!(is >> get_time (time, format))) + return nullptr; + else + // tellg () behaves as UnformattedInputFunction, so returns failure + // status if eofbit is set. + // + return const_cast<char*> ( + input + (is.eof () + ? strlen (input) + : static_cast<size_t> (is.tellg ()))); + } + + static time_t + timegm (tm* ctm) + { + const time_t e (static_cast<time_t> (-1)); + + // We will use an example to explain how it works. Say *ctm contains 9 AM + // of some day. Note that no time zone information is available. + // + // Convert it to the time from Epoch as if it's in the local time zone. + // + ctm->tm_isdst = -1; + time_t t (mktime (ctm)); + if (t == e) + return e; + + // Let's say we are in Moscow, and t contains the time passed from Epoch + // till 9 AM MSK. But that is not what we need. What we need is the time + // passed from Epoch till 9 AM GMT. This is some bigger number, as it takes + // longer to achieve the same calendar time for more Western location. So + // we need to find that offset, and increment t with it to obtain the + // desired value. The offset is effectively the time difference between MSK + // and GMT time zones. + // + tm gtm; + if (gmtime_r (&t, &gtm) == nullptr) + return e; + + // gmtime_r() being called for the timepoint t returns 6 AM. So now we + // have *ctm and gtm, whose value difference (3 hours) reflects the + // desired offset.
The only problem is that we can not subtract gtm from + // *ctm, to get the offset expressed as time_t. To do that we need to apply + // to both of them the same conversion function transforming std::tm to + // std::time_t. The mktime() can do that, so the expression (mktime(ctm) - + // mktime(&gtm)) calculates the desired offset. + // + // To ensure mktime() works exactly the same way for both cases, we need + // to reset Daylight Saving Time flag for each of *ctm and gtm. + // + ctm->tm_isdst = 0; + time_t lt (mktime (ctm)); + if (lt == e) + return e; + + gtm.tm_isdst = 0; + time_t gt (mktime (&gtm)); + if (gt == e) + return e; + + // C11 standard specifies time_t to be a real type (integer and real + // floating types are collectively called real types). So we can not + // consider it to be signed. + // + return lt > gt ? t + (lt - gt) : t - (gt - lt); + } +} + +using namespace details; +#endif + +namespace butl +{ + static pair<tm, chrono::nanoseconds> + from_string (const char* input, const char* format, const char** end) + { + auto bad_val = []() {throw system_error (EINVAL, system_category ());}; + + // See if we have our specifier. + // + size_t i (0); + size_t n (strlen (format)); + for (; i != n; ++i) + { + if (format[i] == '%' && i + 1 != n) + { + if (format[i + 1] == '[') + break; + else + ++i; // To handle %%. + } + } + + // Call the fraction of a second as just fraction from now on. + // + using namespace chrono; + nanoseconds ns (nanoseconds::zero ()); + + if (i == n) + { + // No %[], so just parse with strptime(). + // + tm t {}; + const char* p (strptime (input, format, &t)); + if (p == nullptr) + bad_val (); + + if (end != nullptr) + *end = p; + else if (*p != '\0') + bad_val (); // Input is not fully read. + + return make_pair (t, ns); + } + + // Now the overall plan is: + // + // 1. Parse the fraction part of the input string to obtain nanoseconds. + // + // 2. Remove fraction part from the input string. + // + // 3. Remove %[] from the format string. + // + // 4.
Re-parse the modified input with the modified format to fill the + // std::tm structure. + // + // Parse the %[] specifier. + // + assert (format[i] == '%'); + string fm (format, i++); // Start assembling the new format string. + + assert (format[i] == '['); + if (++i == n) + bad_val (); + + char d (format[i]); // Delimiter character. + if (++i == n) + bad_val (); + + char f (format[i]); // Fraction specifier character. + if ((f != 'N' && f != 'U' && f != 'M') || ++i == n) + bad_val (); + + if (format[i++] != ']') + bad_val (); + + // Parse the input with the initial part of the format string, the one + // that precedes the %[] specifier. The returned pointer will be the + // position we need to start from to parse the fraction. + // + tm t {}; + + // What if %[] is first, there is nothing before it? According to the + // strptime() documentation an empty format string is a valid one. + // + const char* p (strptime (input, fm.c_str (), &t)); + if (p == nullptr) + bad_val (); + + // Start assembling the new input string. + // + string in (input, p - input); + size_t fn (0); // Fraction size. + + if (d == *p) + { + // Fraction present in the input. + // + + // Read fraction digits. + // + char buf [10]; + size_t i (0); + size_t n (f == 'N' ? 9 : (f == 'U' ? 6 : 3)); + for (++p; i < n && *p >= '0' && *p <= '9'; ++i, ++p) + buf[i] = *p; + + if (i < n) + bad_val (); + + buf[n] = '\0'; + fn = n; + + // Calculate nanoseconds. + // + char* e (nullptr); + unsigned long long t (strtoull (buf, &e, 10)); + assert (e == buf + n); + + switch (f) + { + case 'N': ns = nanoseconds (t); break; + case 'U': ns = microseconds (t); break; + case 'M': ns = milliseconds (t); break; + default: assert (false); + } + + // Actually the idea to fully remove the fraction from the input string, + // and %[] from the format string, has a flaw. After the fraction removal + // the spaces around it will be "swallowed" with a single space in the + // format string.
So, as an example, for the input: + // + // 2016-02-21 19:31:10 .384902285 GMT + // + // And the format: + // + // %Y-%m-%d %H:%M:%S %[.N] + // + // The unparsed tail of the input will be 'GMT' while expected to be + // ' GMT'. To fix that we will not remove, but replace the mentioned + // parts with some non-space character. + // + fm += '-'; + in += '-'; + } + + fm += format + i; + in += p; + + // Reparse the modified input with the modified format. + // + t = {}; + const char* b (in.c_str ()); + p = strptime (b, fm.c_str (), &t); + + if (p == nullptr) + bad_val (); + + if (end != nullptr) + *end = input + (p - b + fn); + else if (*p != '\0') + bad_val (); // Input is not fully read. + + return make_pair (t, ns); + } + + timestamp + from_string (const char* input, + const char* format, + bool local, + const char** end) + { + pair<tm, chrono::nanoseconds> t (from_string (input, format, end)); + + time_t time (local ? mktime (&t.first) : timegm (&t.first)); + if (time == -1) + throw system_error (errno, system_category ()); + + return timestamp::clock::from_time_t (time) + t.second; + } +} diff --git a/tests/buildfile b/tests/buildfile index 0ad40e5..45c78d6 100644 --- a/tests/buildfile +++ b/tests/buildfile @@ -2,6 +2,6 @@ # copyright : Copyright (c) 2014-2016 Code Synthesis Ltd # license : MIT; see accompanying LICENSE file -d = dir-iterator/ path/ prefix-map/ sha256/ triplet/ +d = dir-iterator/ path/ prefix-map/ sha256/ timestamp/ triplet/ .: $d include $d diff --git a/tests/timestamp/buildfile b/tests/timestamp/buildfile new file mode 100644 index 0000000..bb565a2 --- /dev/null +++ b/tests/timestamp/buildfile @@ -0,0 +1,7 @@ +# file : tests/timestamp/buildfile +# copyright : Copyright (c) 2014-2016 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +exe{driver}: cxx{driver} ../../butl/lib{butl} + +include ../../butl/ diff --git a/tests/timestamp/driver.cxx b/tests/timestamp/driver.cxx new file mode 100644 index 0000000..2db726d --- /dev/null +++
b/tests/timestamp/driver.cxx @@ -0,0 +1,159 @@ +// file : tests/timestamp/driver.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2016 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include // tzset() + +#include +#include +#include +#include +#include +#include + +#include + +using namespace std; +using namespace butl; + +// Parse the input using the format string. Print the resulted time with the +// same format string, ensure the output matches the input. +// +static bool +parse (const char* in, const char* fmt, bool local, const char* out) +{ + if (out == nullptr) + out = in; + + try + { + const char* e; + timestamp t (from_string (in, fmt, local, &e)); + + ostringstream o; + if (!to_stream (o, t, fmt, false, local)) + return false; + + return o.str () + e == out; + } + catch (...) + { + return false; + } +} + +static bool +parse (const char* in, const char* fmt, const char* out = nullptr) +{ + return parse (in, fmt, true, out) && parse (in, fmt, false, out); +} + +static bool +fail (const char* in, const char* fmt) +{ + try + { + from_string (in, fmt, true); + return false; + } + catch (const system_error&) + { + return true; + } +} + +int +main () +{ + tzset (); // To use butl::to_stream() later on. + + // Invalid %[]. + // + assert (fail ("Apr 08 19:31:10 2016", "%b %d %H:%M:%S%[")); + assert (fail ("Apr 08 19:31:10 2016", "%b %d %H:%M:%S%[.")); + assert (fail ("Apr 08 19:31:10 2016", "%b %d %H:%M:%S%[.U")); + assert (fail ("Apr 08 19:31:10 2016", "%b %d %H:%M:%S%[.A]")); + assert (fail ("Apr 08 19:31:10 2016", "%d %H:%M:%S%[.U] %Y")); + assert (fail ("2016-10-20 11:12:13.123456789", "%Y-%m-%d %H:%M:%S%[N]")); + + // Invalid fraction of a second. + // + assert (fail ("Apr 08 19:31:10. 
2016", "%b %d %H:%M:%S%[.U] %Y")); + assert (fail ("Apr 08 19:31:10.1 2016", "%b %d %H:%M:%S%[.M] %Y")); + assert (fail ("Apr 08 19:31:10.12 2016", "%b %d %H:%M:%S%[.M] %Y")); + assert (fail ("Apr 08 19:31:10.", "%b %d %H:%M:%S%[.U] %Y")); + assert (fail ("Apr 08 19:31:10.1", "%b %d %H:%M:%S%[.M] %Y")); + assert (fail ("Apr 08 19:31:10.12", "%b %d %H:%M:%S%[.M] %Y")); + + // Input is not fully parsed. + // + assert (fail ( + "Feb 21 19:31:10.123456789 2016 GMT", "%b %d %H:%M:%S%[.N] %Y")); + + // Invalid input (%[] unrelated). + // + assert (fail ("Apr 08 19:31:10.123456789 ABC", "%b %d %H:%M:%S%[.N] %Y")); + assert (fail ("Apr 19:31:10 2016", "%b %d %H:%M:%S %Y")); + assert (fail ("Opr 08 19:31:10 2016", "%b %d %H:%M:%S %Y")); + + // Parse valid input with a valid format. + // + assert (parse ( + "Apr 18 19:31:10 2016", "%b %d %H:%M:%S %Y", "Apr 18 19:31:10 2016")); + + assert (parse ("Apr 08 19:31:10 2016", "%b %d %H:%M:%S %Y")); + assert (parse ("2016-04-08 19:31:10", "%Y-%m-%d %H:%M:%S")); + + assert (parse ("ABC=Apr 18 19:31:10 2016 ABC", "ABC=%b %d %H:%M:%S %Y")); + assert (parse ("ABC=2016-04-08 19:31:10 ABC", "ABC=%Y-%m-%d %H:%M:%S")); + + assert (parse ("Feb 11 19:31:10 2016 GMT", "%b %d %H:%M:%S%[.N] %Y")); + assert (parse ("2016-02-11 19:31:10 GMT", "%Y-%m-%d %H:%M:%S%[.N]")); + + assert (parse ( + "Feb 21 19:31:10.384902285 2016 GMT", "%b %d %H:%M:%S%[.N] %Y")); + assert (parse ( + "2016-02-21 19:31:10.384902285 GMT", "%Y-%m-%d %H:%M:%S%[.N]")); + + assert (parse ( + "Feb 21 19:31:10 .384902285 2016 GMT", "%b %d %H:%M:%S %[.N] %Y")); + assert (parse ( + "2016-02-21 19:31:10 .384902285 GMT", "%Y-%m-%d %H:%M:%S %[.N]")); + + assert (parse ( + "2016-02-21 19:31:10 .384902285 GMT", + "%Y-%m-%d %H:%M:%S %[.N]", + "2016-02-21 19:31:10 .384902285 GMT")); + + assert (parse ( + "2016-02-21 19:31:10 .384902285 GMT", + "%Y-%m-%d %H:%M:%S %[.N]", + "2016-02-21 19:31:10 .384902285 GMT")); + + assert (parse ( + "Feb 21 19:31:10 .384902285NS 2016 GMT", "%b %d %H:%M:%S 
%[.N]NS %Y")); + assert (parse ( + "2016-02-21 19:31:10 .384902285NS GMT", "%Y-%m-%d %H:%M:%S %[.N]NS")); + + assert (parse ( + ".384902285 Feb 21 19:31:10 2016", "%[.N] %b %d %H:%M:%S %Y")); + assert (parse ( + ".384902285 2016-02-21 19:31:10", "%[.N] %Y-%m-%d %H:%M:%S")); + assert (parse ( + ".3849022852016-02-21 19:31:10", "%[.N]%Y-%m-%d %H:%M:%S")); + + setlocale (LC_ALL, "de_DE.utf-8"); + locale::global (locale ("de_DE.utf-8")); + assert (parse ("Mai 11 19:31:10 2016 GMT", "%b %d %H:%M:%S%[.N] %Y")); + locale::global (locale ("C")); + + // @@ When debugging strptime() fallback implementation compiled with GCC + // 5.3.1, the following asserts will fail due to bugs in implementation + // of std::get_time() manipulator. So they need to be commented out. + // + assert (fail ("Apr 08 19:31:10 2016", "%b %d %H:%M:%S %Y %")); + assert (fail ("Apr 08 19:31:10", "%b %d %H:%M:%S %Y")); + + assert (parse ( + "Apr 8 19:31:10 2016", "%b %d %H:%M:%S %Y", "Apr 08 19:31:10 2016")); +} -- cgit v1.1