From c0beeb5f0b3285fd7b411859bd68d44b472ad034 Mon Sep 17 00:00:00 2001
From: Karen Arutyunov <karen@codesynthesis.com>
Date: Thu, 14 Apr 2016 16:20:59 +0300
Subject: Add timestamp from_string()

---
 butl/timestamp     |  64 ++++++++++--
 butl/timestamp.cxx | 291 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 345 insertions(+), 10 deletions(-)

(limited to 'butl')
diff --git a/butl/timestamp b/butl/timestamp
index 10090c5..90bca08 100644
--- a/butl/timestamp
+++ b/butl/timestamp
@@ -38,19 +38,22 @@ namespace butl
 
   // Generally-useful special values.
   //
-  const timestamp timestamp_unknown {duration {-1}};
-  const timestamp timestamp_nonexistent {duration {0}};
+  const timestamp timestamp_unknown = timestamp (duration (-1));
+  const timestamp timestamp_nonexistent = timestamp (duration (0));
 
-  // Human-readable representation. By default the timestamp is printed by
-  // localtime_r() in the local timezone, so tzset() from <time.h> should be
-  // called prior to using the corresponding operator or the to_stream()
-  // function (normally from main() or equivalent).
+  // Print human-readable representation of the timestamp.
+  //
+  // By default the timestamp is printed by localtime_r() in the local
+  // timezone, so tzset() from <time.h> should be called prior to using the
+  // corresponding operator or the to_stream() function (normally from main()
+  // or equivalent).
   //
   // The format argument in the to_stream() function is the put_time() format
   // string except that it also supports the nanoseconds conversion specifier
   // in the form %[<d>N] where <d> is the optional single delimiter character,
-  // for example '.'. If the nanoseconds part is 0, then it is not printed (nor
-  // the delimiter character).
+  // for example '.'. If the nanoseconds part is 0, then it is not printed
+  // (nor the delimiter character). Otherwise, if necessary, the nanoseconds
+  // part is padded to 9 characters with leading zeros.
   //
   // The special argument in the to_stream() function indicates whether the
   // special timestamp_unknown and timestamp_nonexistent values should be
@@ -87,6 +90,51 @@ namespace butl
 
   std::ostream&
   operator<< (std::ostream&, const duration&);
+
+  // Parse human-readable representation of the timestamp.
+  //
+  // The format argument is the strptime() format string except that it also
+  // supports the fraction of a second specifier in the form %[<d><f>], where
+  // <d> is the single delimiter character, for example '.', and <f> is one of
+  // the 'N', 'U', 'M' characters, denoting nanoseconds, microseconds and
+  // milliseconds, respectively.
+  //
+  // The delimiter <d> is mandatory in the format. If no such character is
+  // encountered at the corresponding position of the input string, the
+  // function behaves as if no %[] specifier were provided. Only a single
+  // %[] specifier in the format string is currently supported.
+  //
+  // If the delimiter is present in the input, then it should be followed by
+  // a 9 (N), 6 (U), or 3 (M) digit value padded with leading zeros.
+  //
+  // If the local argument is true, then the input is assumed to be local time
+  // and the result is returned as local time as well. Otherwise, UTC is used
+  // in both cases.
+  //
+  // If the end argument is not NULL, then it is set to point to the first
+  // character that was not parsed. Otherwise, throw std::system_error (EINVAL)
+  // in case of any unparsed characters.
+  //
+  // Throw std::system_error on input/format mismatch and underlying time
+  // conversion function failures.
+  //
+  // Note that internally from_string() calls strptime(), which behaves
+  // according to the process' C locale (set with std::setlocale()) and not
+  // the C++ locale (set with std::locale::global()). Meanwhile the behaviour
+  // can be affected by std::locale::global() as well, as it itself calls
+  // std::setlocale() for the locale with a name.
+  //
+  // Potential improvements:
+  //   - support %() version for non-optional component but with optional
+  //     delimiter
+  //   - ability to parse local, return UTC and vice-versa
+  //   - handle timezone parsing
+  //
+  timestamp
+  from_string (const char* input,
+               const char* format,
+               bool local,
+               const char** end = nullptr);
 };
 
 #endif // BUTL_TIMESTAMP
diff --git a/butl/timestamp.cxx b/butl/timestamp.cxx
index 6299ba3..f3966b7 100644
--- a/butl/timestamp.cxx
+++ b/butl/timestamp.cxx
@@ -4,13 +4,16 @@
 
 #include <butl/timestamp>
 
-#include <time.h>  // localtime_r(), gmtime_r()
+#include <time.h>  // localtime_r(), gmtime_r(), strptime(), timegm()
 #include <errno.h> // EINVAL
 
-#include <ctime>        // tm, strftime()
+#include <ctime>        // tm, time_t, strftime(), mktime()
+#include <cstdlib>      // strtoull()
+#include <cassert>
 #include <iomanip>      // put_time(), setw(), dec, right
 #include <cstring>      // strlen(), memcpy()
 #include <ostream>
+#include <utility>      // pair, make_pair()
 #include <stdexcept>    // runtime_error
 #include <system_error>
 
@@ -28,6 +31,9 @@ using namespace std;
 // of the std::tm argument.
 //
 #ifdef __GLIBCXX__
+#include <ctime>   // tm, strftime()
+#include <ostream>
+
 namespace details
 {
   struct put_time_data
@@ -253,3 +259,284 @@ namespace butl
     return os;
   }
 }
+
+// VC++ implementation of strptime() via std::get_time().
+//
+// To debug fallback functions with GCC, uncomment the following defines.
+//
+//#define _MSC_VER
+//#define strptime strptime_
+//#define timegm   timegm_
+
+#ifdef _MSC_VER
+#include <ctime>   // time_t, tm, mktime(), gmtime()
+#include <locale>
+#include <clocale>
+#include <sstream>
+#include <iomanip>
+#include <cstring> // strlen()
+
+namespace details
+{
+  static char*
+  strptime (const char* input, const char* format, tm* time)
+  {
+    // The original strptime() function honors the process' C locale (set with
+    // std::setlocale()), which can differ from the process' C++ locale (set
+    // with std::locale::global()), so imbue the stream with the former.
+    //
+    istringstream src (input);
+    src.imbue (locale (setlocale (LC_ALL, nullptr)));
+    src >> get_time (time, format);
+
+    if (src.fail ())
+      return nullptr;
+
+    // tellg() behaves as UnformattedInputFunction and so returns the failure
+    // status if eofbit is set. In that case treat the whole input as consumed.
+    //
+    const size_t consumed (src.eof ()
+                           ? strlen (input)
+                           : static_cast<size_t> (src.tellg ()));
+    return const_cast<char*> (input + consumed);
+  }
+
+  static time_t
+  timegm (tm* ctm)
+  {
+    const time_t e (static_cast<time_t> (-1));
+
+    // Emulate timegm() using mktime() and gmtime_r(); return -1 on failure.
+    // We will use an example to explain how it works. Say *ctm contains 9 AM
+    // of some day (no time zone information is available). First convert it
+    // to the time from Epoch as if it were in the local time zone.
+    //
+    ctm->tm_isdst = -1;
+    time_t t (mktime (ctm));
+    if (t == e)
+      return e;
+
+    // Let's say we are in Moscow, and t contains the time passed from Epoch
+    // till 9 AM MSK. But that is not what we need. What we need is the time
+    // passed from Epoch till 9 AM GMT. This is a somewhat bigger number, as it
+    // takes longer to reach the same calendar time in a more western location.
+    // So we need to find that offset, and increment t with it to obtain the
+    // desired value. The offset is effectively the time difference between
+    // the MSK and GMT time zones.
+    //
+    tm gtm;
+    if (gmtime_r (&t, &gtm) == nullptr)
+      return e;
+
+    // gmtime_r() being called for the timepoint t returns 6 AM. So now we
+    // have *ctm and gtm, whose difference (3 hours) reflects the desired
+    // offset. The only problem is that we cannot subtract gtm from *ctm
+    // directly to get the offset expressed as time_t. To do that we need to
+    // apply to both of them the same conversion function transforming std::tm
+    // to std::time_t. The mktime() can do that, so the expression
+    // (mktime(ctm) - mktime(&gtm)) calculates the desired offset.
+    //
+    // To ensure mktime() works exactly the same way for both cases, we need
+    // to reset the Daylight Saving Time flag for each of *ctm and gtm.
+    //
+    ctm->tm_isdst = 0;
+    time_t lt (mktime (ctm));
+    if (lt == e)
+      return e;
+
+    gtm.tm_isdst = 0;
+    time_t gt (mktime (&gtm));
+    if (gt == e)
+      return e;
+
+    // The C11 standard specifies time_t to be a real type (integer and real
+    // floating types are collectively called real types). So we cannot assume
+    // it to be signed and therefore only subtract the smaller from the larger.
+    //
+    return lt > gt ? t + (lt - gt) : t - (gt - lt);
+  }
+}
+
+using namespace details;
+#endif
+
+namespace butl
+{
+  // Parse the input into the broken-down time and the fraction of a second.
+  // If end is not NULL, set *end to the first unparsed character. Otherwise,
+  // fail if there are any. All failures throw system_error (EINVAL).
+  //
+  static pair<tm, chrono::nanoseconds>
+  from_string (const char* input, const char* format, const char** end)
+  {
+    auto bad_val = []() {throw system_error (EINVAL, system_category ());};
+
+    // See if we have our %[] specifier.
+    //
+    size_t i (0);
+    size_t n (strlen (format));
+    for (; i != n; ++i)
+    {
+      if (format[i] == '%' && i + 1 != n)
+      {
+        if (format[i + 1] == '[')
+          break;
+        else
+          ++i; // To handle %%.
+      }
+    }
+
+    // From now on refer to the fraction of a second as just the fraction.
+    //
+    using namespace chrono;
+    nanoseconds ns (nanoseconds::zero ());
+
+    if (i == n)
+    {
+      // No %[], so just parse with strptime().
+      //
+      tm t {};
+      const char* p (strptime (input, format, &t));
+      if (p == nullptr)
+        bad_val ();
+
+      if (end != nullptr)
+        *end = p;
+      else if (*p != '\0')
+        bad_val (); // Input is not fully read.
+
+      return make_pair (t, ns);
+    }
+
+    // Now the overall plan is:
+    //
+    // 1. Parse the fraction part of the input string to obtain nanoseconds.
+    // 2. Remove the fraction part from the input string.
+    // 3. Remove %[] from the format string.
+    // 4. Re-parse the modified input with the modified format to fill the
+    //    std::tm structure.
+    //
+    // Parse the %[] specifier.
+    //
+    assert (format[i] == '%');
+    string fm (format, i++); // Start assembling the new format string.
+
+    assert (format[i] == '[');
+    if (++i == n)
+      bad_val ();
+
+    char d (format[i]); // Delimiter character.
+    if (++i == n)
+      bad_val ();
+
+    char f (format[i]); // Fraction specifier character.
+    if ((f != 'N' && f != 'U' && f != 'M') || ++i == n)
+      bad_val ();
+
+    if (format[i++] != ']')
+      bad_val ();
+
+    // Parse the input with the initial part of the format string, the one
+    // that precedes the %[] specifier. The returned pointer will be the
+    // position we need to start from to parse the fraction.
+    //
+    tm t {};
+
+    // What if %[] is first, there is nothing before it? According to the
+    // strptime() documentation an empty format string is a valid one.
+    //
+    const char* p (strptime (input, fm.c_str (), &t));
+    if (p == nullptr)
+      bad_val ();
+
+    // Start assembling the new input string.
+    //
+    string in (input, p - input);
+    size_t fn (0); // Fraction size.
+
+    if (d == *p)
+    {
+      // Fraction present in the input. Read the fraction digits.
+      //
+      char buf [10];
+      size_t i (0);
+      size_t n (f == 'N' ? 9 : (f == 'U' ? 6 : 3));
+      for (++p; i < n && *p >= '0' && *p <= '9'; ++i, ++p)
+        buf[i] = *p;
+
+      if (i < n)
+        bad_val ();
+
+      buf[n] = '\0';
+      fn = n;
+
+      // Calculate nanoseconds.
+      //
+      char* e (nullptr);
+      unsigned long long t (strtoull (buf, &e, 10));
+      assert (e == buf + n);
+
+      switch (f)
+      {
+      case 'N': ns = nanoseconds (t); break;
+      case 'U': ns = microseconds (t); break;
+      case 'M': ns = milliseconds (t); break;
+      default: assert (false);
+      }
+
+      // Actually the idea to fully remove the fraction from the input string,
+      // and %[] from the format string, has a flaw. After the fraction removal
+      // the spaces around it will be "swallowed" by a single space in the
+      // format string. So, as an example, for the input and the format:
+      //
+      // 2016-02-21 19:31:10 .384902285 GMT
+      // %Y-%m-%d %H:%M:%S %[.N]
+      //
+      // The unparsed tail of the input will be 'GMT' while expected to be
+      // ' GMT'. To fix that we will not remove, but replace the mentioned
+      // parts with some non-space character.
+      //
+      fm += '-';
+      in += '-';
+    }
+
+    // Append the remaining parts of the format and the input.
+    //
+    fm += format + i;
+    in += p;
+
+    // Reparse the modified input with the modified format.
+    //
+    t = {};
+    const char* b (in.c_str ());
+    p = strptime (b, fm.c_str (), &t);
+
+    if (p == nullptr)
+      bad_val ();
+
+    // Translate the position in the modified input back to the original one
+    // (a parsed fraction had its delimiter replaced and fn digits dropped).
+    //
+    if (end != nullptr)
+      *end = input + (p - b + fn);
+    else if (*p != '\0')
+      bad_val (); // Input is not fully read.
+
+    return make_pair (t, ns);
+  }
+
+  timestamp
+  from_string (const char* input,
+               const char* format,
+               bool local,
+               const char** end)
+  {
+    pair<tm, chrono::nanoseconds> t (from_string (input, format, end));
+    time_t time (local ? mktime (&t.first) : timegm (&t.first));
+    if (time == -1)
+      throw system_error (errno, system_category ());
+
+    // The clock tick may be coarser than a nanosecond, so cast explicitly.
+    return timestamp::clock::from_time_t (time) +
+      chrono::duration_cast<timestamp::duration> (t.second);
+  }
+}
-- 
cgit v1.1