From 2d15efda5db161accd5f572fd4816885bce7c68c Mon Sep 17 00:00:00 2001
From: Karen Arutyunov <karen@codesynthesis.com>
Date: Thu, 23 Jun 2022 22:55:20 +0300
Subject: Split and merge manifest value/comment pair differently depending on
 whether it is multiline or not

---
 libbutl/manifest-parser.cxx     | 145 +++++++++++++++++++++++++++++++++-------
 libbutl/manifest-serializer.cxx |  91 +++++++++++++++++++++----
 2 files changed, 199 insertions(+), 37 deletions(-)

(limited to 'libbutl')

diff --git a/libbutl/manifest-parser.cxx b/libbutl/manifest-parser.cxx
index 258a536..904910a 100644
--- a/libbutl/manifest-parser.cxx
+++ b/libbutl/manifest-parser.cxx
@@ -148,41 +148,136 @@ namespace butl
   {
     using iterator = string::const_iterator;
 
-    auto space = [] (char c) -> bool {return c == ' ' || c == '\t';};
+    // Parse the value differently depending on whether it is multi-line or
+    // not.
+    //
+    if (v.find ('\n') == string::npos) // Single-line.
+    {
+      auto space = [] (char c) {return c == ' ' || c == '\t';};
 
-    iterator i (v.begin ());
-    iterator e (v.end ());
+      iterator i (v.begin ());
+      iterator e (v.end ());
 
-    string r;
-    size_t n (0);
-    for (char c; i != e && (c = *i) != ';'; ++i)
-    {
-      // Unescape ';' character.
+      string r;
+      size_t n (0);
+      for (char c; i != e && (c = *i) != ';'; ++i)
+      {
+        // Unescape ';' and '\' characters.
+        //
+        if (c == '\\' && i + 1 != e && (*(i + 1) == ';' || *(i + 1) == '\\'))
+          c = *++i;
+
+        r += c;
+
+        if (!space (c))
+          n = r.size ();
+      }
+
+      // Strip the value trailing spaces.
       //
-      if (c == '\\' && i + 1 != e && *(i + 1) == ';')
-        c = *++i;
+      if (r.size () != n)
+        r.resize (n);
 
-      r += c;
+      // Find beginning of a comment (i).
+      //
+      if (i != e)
+      {
+        // Skip spaces.
+        //
+        for (++i; i != e && space (*i); ++i);
+      }
 
-      if (!space (c))
-        n = r.size ();
+      return make_pair (move (r), string (i, e));
     }
+    else // Multi-line.
+    {
+      string r;
+      string c;
 
-    // Strip the value trailing spaces.
-    //
-    if (r.size () != n)
-      r.resize (n);
+      // Parse the value lines until the comment separator is encountered or
+      // the end of the value is reached. Add these lines to the resulting
+      // value, unescaping them if required.
+      //
+      // Note that we only need to unescape lines which have the '\+;' form.
+      //
+      auto i (v.begin ());
+      auto e (v.end ());
 
-    // Find beginning of a comment (i).
-    //
-    if (i != e)
-    {
-      // Skip spaces.
+      while (i != e)
+      {
+        // Find the end of the line and while at it the first non-backslash
+        // character.
+        //
+        auto le (i);
+        auto nb (e);
+        for (; le != e && *le != '\n'; ++le)
+        {
+          if (nb == e && *le != '\\')
+            nb = le;
+        }
+
+        // If the end of the value is not reached, then position to the
+        // beginning of the next line; otherwise, to the end of the value.
+        //
+        auto next = [&i, &le, &e] () {i = (le != e ? le + 1 : e);};
+
+        // If the first non-backslash character is ';' and it is the last
+        // character on the line, then this is either the comment separator or
+        // an escape sequence.
+        //
+        if (nb != e && *nb == ';' && nb + 1 == le)
+        {
+          // If ';' is the first (and thus the only) character on the line,
+          // then this is the comment separator and we bail out from this
+          // loop. Note that in this case we need to trim the trailing newline
+          // (but only one) from the resulting value since it is considered as
+          // a part of the separator.
+          //
+          if (nb == i)
+          {
+            if (!r.empty ())
+            {
+              assert (r.back () == '\n');
+              r.pop_back ();
+            }
+
+            next ();
+            break;
+          }
+          //
+          // Otherwise, this is an escape sequence, so unescape it. For that
+          // just take the rightmost half of the string:
+          //
+          // \;     -> ;
+          // \\;    -> \;
+          // \\\;   -> \;
+          // \\\\;  -> \\;
+          // \\\\\; -> \\;
+          //
+          else
+            i += (le - i) / 2;
+        }
+
+        // Add the line to the resulting value together with the trailing
+        // newline, if present.
+        //
+        r.append (i, le);
+
+        if (le != e)
+          r += '\n';
+
+        next ();
+      }
+
+      // If we haven't reached the end of the value then it means we've
+      // encountered the comment separator. In this case save the remaining
+      // value part as a comment.
       //
-      for (++i; i != e && space (*i); ++i);
-    }
+      if (i != e)
+        c = string (i, e);
 
-    return make_pair (move (r), string (i, e));
+      return make_pair (move (r), move (c));
+    }
   }
 
   void manifest_parser::
diff --git a/libbutl/manifest-serializer.cxx b/libbutl/manifest-serializer.cxx
index b0d0324..26699e0 100644
--- a/libbutl/manifest-serializer.cxx
+++ b/libbutl/manifest-serializer.cxx
@@ -101,22 +101,89 @@ namespace butl
   merge_comment (const string& value, const string& comment)
   {
     string r;
-    for (char c: value)
+
+    // Merge the value and comment differently depending on whether either of
+    // them is multi-line or not.
+    //
+    if (value.find ('\n') == string::npos && // Single-line.
+        comment.find ('\n') == string::npos)
     {
-      // Escape ';' character.
-      //
-      if (c == ';')
-        r += '\\';
+      for (char c: value)
+      {
+        // Escape ';' and '\' characters.
+        //
+        if (c == ';' || c == '\\')
+          r += '\\';
 
-      r += c;
-    }
+        r += c;
+      }
 
-    // Add the comment.
-    //
-    if (!comment.empty ())
+      // Add the comment.
+      //
+      if (!comment.empty ())
+      {
+        r += "; ";
+        r += comment;
+      }
+    }
+    else // Multi-line.
     {
-      r += "; ";
-      r += comment;
+      // Parse the value lines and add them to the resulting value, escaping
+      // them if required.
+      //
+      // Note that we only need to escape lines which have the '\*;' form.
+      //
+      for (auto i (value.begin ()), e (value.end ()); i != e; )
+      {
+        // Find the end of the line and while at it the first non-backslash
+        // character.
+        //
+        auto le (i);
+        auto nb (e);
+        for (; le != e && *le != '\n'; ++le)
+        {
+          if (nb == e && *le != '\\')
+            nb = le;
+        }
+
+        // If the first non-backslash character is ';' and it is the last
+        // character on the line, then we need to escape the line characters.
+        // Note that we only escape ';' if it is the only character on the
+        // line. Otherwise, we only escape backslashes doubling the number of
+        // them from the left:
+        //
+        // ;   -> \;
+        // \;  -> \\;
+        // \\; -> \\\\;
+        // \\\; -> \\\\\\;
+        //
+        if (nb != e && *nb == ';' && nb + 1 == le)
+          r.append (nb == i ? 1 : nb - i, '\\');
+
+        // Add the line to the resulting value together with the trailing
+        // newline, if present.
+        //
+        r.append (i, le);
+
+        if (le != e)
+          r += '\n';
+
+        // If the end of the value is not reached, then position to the
+        // beginning of the next line; otherwise, to the end of the value.
+        //
+        i = (le != e ? le + 1 : e);
+      }
+
+      // Append the comment, if present.
+      //
+      if (!comment.empty ())
+      {
+        if (!r.empty ())
+          r += '\n';
+
+        r += ";\n";
+        r += comment;
+      }
     }
 
     return r;
-- 
cgit v1.1