From c0beeb5f0b3285fd7b411859bd68d44b472ad034 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Thu, 14 Apr 2016 16:20:59 +0300 Subject: Add timestamp from_string() --- butl/timestamp | 64 ++++++++++-- butl/timestamp.cxx | 291 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 345 insertions(+), 10 deletions(-) (limited to 'butl') diff --git a/butl/timestamp b/butl/timestamp index 10090c5..90bca08 100644 --- a/butl/timestamp +++ b/butl/timestamp @@ -38,19 +38,22 @@ namespace butl // Generally-useful special values. // - const timestamp timestamp_unknown {duration {-1}}; - const timestamp timestamp_nonexistent {duration {0}}; + const timestamp timestamp_unknown = timestamp (duration (-1)); + const timestamp timestamp_nonexistent = timestamp (duration (0)); - // Human-readable representation. By default the timestamp is printed by - // localtime_r() in the local timezone, so tzset() from <time.h> should be - // called prior to using the corresponding operator or the to_stream() - // function (normally from main() or equivalent). + // Print human-readable representation of the timestamp. + // + // By default the timestamp is printed by localtime_r() in the local + // timezone, so tzset() from <time.h> should be called prior to using the + // corresponding operator or the to_stream() function (normally from main() + // or equivalent). // // The format argument in the to_stream() function is the put_time() format // string except that it also supports the nanoseconds conversion specifier // in the form %[<d>N] where <d> is the optional single delimiter character, - // for example '.'. If the nanoseconds part is 0, then it is not printed (nor - // the delimiter character). + // for example '.'. If the nanoseconds part is 0, then it is not printed + // (nor the delimiter character). Otherwise, if necessary, the nanoseconds + // part is padded to 9 characters with leading zeros. 
// // The special argument in the to_stream() function indicates whether the // special timestamp_unknown and timestamp_nonexistent values should be @@ -87,6 +90,51 @@ namespace butl std::ostream& operator<< (std::ostream&, const duration&); + + // Parse human-readable representation of the timestamp. + // + // The format argument is the strptime() format string except that it also + // supports the fraction of a second specifier in the form %[<d><f>], where + // <d> is the optional single delimiter character, for example '.', and <f> + // is one of the 'N', 'U', 'M' characters, denoting nanoseconds, + // microseconds and milliseconds, respectively. + // + // The delimiter <d> is mandatory. If no such character is encountered at + // the corresponding position of the input string, the function behaves as + // if no %[] specifier were provided. Only a single %[] specifier in the + // format string is currently supported. + // + // If the delimiter is present, then it should be followed by 9 (N), 6 (U), + // or 3 (M) digit value padded with leading zeros if necessary. + // + // If the local argument is true, then the input is assumed to be local time + // and the result is returned as local time as well. Otherwise, UTC is used + // in both cases. + // + // If the end argument is not NULL, then it points to the first character + // that was not parsed. Otherwise, throw std::system_error (EINVAL) in case + // of any unparsed characters. + // + // Throw std::system_error on input/format mismatch and underlying time + // conversion function failures. + // + // Note that internally from_string() calls strptime(), which behaves + // according to the process' C locale (set with std::setlocale()) and not + // the C++ locale (set with std::locale::global()). Meanwhile the behaviour + // can be affected by std::locale::global() as well, as it itself calls + // std::setlocale() for the locale with a name. 
+ // + // Potential improvements: + // - support %(<d><f>) version for non-optional component but with optional + // delimiter + // - ability to parse local, return UTC and vice-versa + // - handle timezone parsing + // + timestamp + from_string (const char* input, + const char* format, + bool local, + const char** end = nullptr); }; #endif // BUTL_TIMESTAMP diff --git a/butl/timestamp.cxx b/butl/timestamp.cxx index 6299ba3..f3966b7 100644 --- a/butl/timestamp.cxx +++ b/butl/timestamp.cxx @@ -4,13 +4,16 @@ #include <butl/timestamp> -#include <time.h> // localtime_r(), gmtime_r() +#include <time.h> // localtime_r(), gmtime_r(), strptime(), timegm() #include <errno.h> // EINVAL -#include <ctime> // tm, strftime() +#include <ctime> // tm, time_t, strftime(), mktime() +#include <cstdlib> // strtoull() +#include <cassert> #include <iomanip> // put_time(), setw(), dec, right #include <cstring> // strlen(), memcpy() #include <ostream> +#include <utility> // pair, make_pair() #include <stdexcept> // runtime_error #include <system_error> @@ -28,6 +31,9 @@ using namespace std; // of the std::tm argument. // #ifdef __GLIBCXX__ +#include <ctime> // tm, strftime() +#include <ostream> + namespace details { struct put_time_data @@ -253,3 +259,284 @@ namespace butl return os; } } + +// VC++ implementation of strptime() via std::get_time(). +// +// To debug fallback functions with GCC, uncomment the following defines. +// +//#define _MSC_VER +//#define strptime strptime_ +//#define timegm timegm_ + +#ifdef _MSC_VER +#include <ctime> // time_t, tm, mktime(), gmtime() +#include <locale> +#include <clocale> +#include <sstream> +#include <iomanip> +#include <cstring> // strlen() + +namespace details +{ + static char* + strptime (const char* input, const char* format, tm* time) + { + istringstream is (input); + + // The original strptime() function behaves according to the process' C + // locale (set with std::setlocale()), which can differ from the process + // C++ locale (set with std::locale::global()). 
+ // + is.imbue (locale (setlocale (LC_ALL, nullptr))); + + if (!(is >> get_time (time, format))) + return nullptr; + else + // tellg () behaves as UnformattedInputFunction, so returns failure + // status if eofbit is set. + // + return const_cast<char*> ( + input + (is.eof () + ? strlen (input) + : static_cast<size_t> (is.tellg ()))); + } + + static time_t + timegm (tm* ctm) + { + const time_t e (static_cast<time_t> (-1)); + + // We will use an example to explain how it works. Say *ctm contains 9 AM + // of some day. Note that no time zone information is available. + // + // Convert it to the time from Epoch as if it's in the local time zone. + // + ctm->tm_isdst = -1; + time_t t (mktime (ctm)); + if (t == e) + return e; + + // Let's say we are in Moscow, and t contains the time passed from Epoch + // till 9 AM MSK. But that is not what we need. What we need is the time + // passed from Epoch till 9 AM GMT. This is some bigger number, as it takes + // longer to achieve the same calendar time for a more western location. So + // we need to find that offset, and increment t with it to obtain the + // desired value. The offset is effectively the time difference between MSK + // and GMT time zones. + // + tm gtm; + if (gmtime_r (&t, &gtm) == nullptr) + return e; + + // gmtime_r() being called for the timepoint t returns 6 AM. So now we + // have *ctm and gtm, whose value difference (3 hours) reflects the + // desired offset. The only problem is that we can not subtract gtm from + // *ctm, to get the offset expressed as time_t. To do that we need to apply + // to both of them the same conversion function transforming std::tm to + // std::time_t. The mktime() can do that, so the expression (mktime(ctm) - + // mktime(&gtm)) calculates the desired offset. + // + // To ensure mktime() works exactly the same way for both cases, we need + // to reset the Daylight Saving Time flag for each of *ctm and gtm. 
+ // + ctm->tm_isdst = 0; + time_t lt (mktime (ctm)); + if (lt == e) + return e; + + gtm.tm_isdst = 0; + time_t gt (mktime (&gtm)); + if (gt == e) + return e; + + // C11 standard specifies time_t to be a real type (integer and real + // floating types are collectively called real types). So we can not + // consider it to be signed. + // + return lt > gt ? t + (lt - gt) : t - (gt - lt); + } +} + +using namespace details; +#endif + +namespace butl +{ + static pair<tm, chrono::nanoseconds> + from_string (const char* input, const char* format, const char** end) + { + auto bad_val = []() {throw system_error (EINVAL, system_category ());}; + + // See if we have our specifier. + // + size_t i (0); + size_t n (strlen (format)); + for (; i != n; ++i) + { + if (format[i] == '%' && i + 1 != n) + { + if (format[i + 1] == '[') + break; + else + ++i; // To handle %%. + } + } + + // Call the fraction of a second as just fraction from now on. + // + using namespace chrono; + nanoseconds ns (nanoseconds::zero ()); + + if (i == n) + { + // No %[], so just parse with strptime(). + // + tm t {}; + const char* p (strptime (input, format, &t)); + if (p == nullptr) + bad_val (); + + if (end != nullptr) + *end = p; + else if (*p != '\0') + bad_val (); // Input is not fully read. + + return make_pair (t, ns); + } + + // Now the overall plan is: + // + // 1. Parse the fraction part of the input string to obtain nanoseconds. + // + // 2. Remove fraction part from the input string. + // + // 3. Remove %[] from the format string. + // + // 4. Re-parse the modified input with the modified format to fill the + // std::tm structure. + // + // Parse the %[] specifier. + // + assert (format[i] == '%'); + string fm (format, i++); // Start assembling the new format string. + + assert (format[i] == '['); + if (++i == n) + bad_val (); + + char d (format[i]); // Delimiter character. + if (++i == n) + bad_val (); + + char f (format[i]); // Fraction specifier character. 
+ if ((f != 'N' && f != 'U' && f != 'M') || ++i == n) + bad_val (); + + if (format[i++] != ']') + bad_val (); + + // Parse the input with the initial part of the format string, the one + // that precedes the %[] specifier. The returned pointer will be the + // position we need to start from to parse the fraction. + // + tm t {}; + + // What if %[] is first, there is nothing before it? According to the + // strptime() documentation an empty format string is a valid one. + // + const char* p (strptime (input, fm.c_str (), &t)); + if (p == nullptr) + bad_val (); + + // Start assembling the new input string. + // + string in (input, p - input); + size_t fn (0); // Fraction size. + + if (d == *p) + { + // Fraction present in the input. + // + + // Read fraction digits. + // + char buf [10]; + size_t i (0); + size_t n (f == 'N' ? 9 : (f == 'U' ? 6 : 3)); + for (++p; i < n && *p >= '0' && *p <= '9'; ++i, ++p) + buf[i] = *p; + + if (i < n) + bad_val (); + + buf[n] = '\0'; + fn = n; + + // Calculate nanoseconds. + // + char* e (nullptr); + unsigned long long t (strtoull (buf, &e, 10)); + assert (e == buf + n); + + switch (f) + { + case 'N': ns = nanoseconds (t); break; + case 'U': ns = microseconds (t); break; + case 'M': ns = milliseconds (t); break; + default: assert (false); + } + + // Actually the idea to fully remove the fraction from the input string, + // and %[] from the format string, has a flaw. After the fraction removal + // the spaces around it will be "swallowed" with a single space in the + // format string. So, as an example, for the input: + // + // 2016-02-21 19:31:10 .384902285 GMT + // + // And the format: + // + // %Y-%m-%d %H:%M:%S %[.N] + // + // The unparsed tail of the input will be 'GMT' while expected to be + // ' GMT'. To fix that we will not remove, but replace the mentioned + // parts with some non-space character. + // + fm += '-'; + in += '-'; + } + + fm += format + i; + in += p; + + // Reparse the modified input with the modified format. 
+ // + t = {}; + const char* b (in.c_str ()); + p = strptime (b, fm.c_str (), &t); + + if (p == nullptr) + bad_val (); + + if (end != nullptr) + *end = input + (p - b + fn); + else if (*p != '\0') + bad_val (); // Input is not fully read. + + return make_pair (t, ns); + } + + timestamp + from_string (const char* input, + const char* format, + bool local, + const char** end) + { + pair<tm, chrono::nanoseconds> t (from_string (input, format, end)); + + time_t time (local ? mktime (&t.first) : timegm (&t.first)); + if (time == -1) + throw system_error (errno, system_category ()); + + return timestamp::clock::from_time_t (time) + t.second; + } +} -- cgit v1.1