// file      : libbuild2/bin/def-rule.cxx -*- C++ -*-
// license   : MIT; see accompanying LICENSE file

#include <libbuild2/bin/def-rule.hxx>

#include <libbuild2/depdb.hxx>
#include <libbuild2/scope.hxx>
#include <libbuild2/target.hxx>
#include <libbuild2/algorithm.hxx>
#include <libbuild2/filesystem.hxx>
#include <libbuild2/diagnostics.hxx>

#include <libbuild2/bin/target.hxx>
#include <libbuild2/bin/utility.hxx>

namespace build2
{
  namespace bin
  {
    // In C global uninitialized data becomes a "common symbol" (an equivalent
    // definition compiled as C++ results in a BSS symbol) which allows some
    // archaic merging of multiple such definitions during linking (see GNU ld
    // --warn-common for background). Note that this merging may happen with
    // other data symbol types, not just common.
    //
    struct symbols
    {
      set<string> d; // data
      set<string> r; // read-only data
      set<string> b; // uninitialized data (BSS)
      set<string> c; // common uninitialized data
      set<string> t; // text (code)
    };

    static void
    read_dumpbin (istream& is, symbols& syms)
    {
      // Lines that describe symbols look like:
      //
      // 0   1        2      3          4            5 6
      // IDX OFFSET   SECT   SYMTYPE    VISIBILITY     SYMNAME
      // ----------------------------------------------------------------------
      // 02E 00000130 SECTA  notype     External     | _standbyState
      // 02F 00000009 SECT9  notype     Static       | _LocalRecoveryInProgress
      // 064 00000020 SECTC  notype ()  Static       | _XLogCheckBuffer
      // 065 00000000 UNDEF  notype ()  External     | _BufferGetTag
      //
      // IDX is the symbol index and OFFSET is its offset.
      //
      // SECT[ION] is the name of the section where the symbol is defined. If
      // UNDEF, then it's a symbol to be resolved at link time from another
      // object file.
      //
      // SYMTYPE is always notype for C/C++ symbols as there's no typeinfo and
      // no way to get the symbol type from name (de)mangling. However, we
      // care if "notype" is suffixed by "()" or not. The presence of () means
      // the symbol is a function, the absence means it isn't.
      //
      // VISIBILITY indicates whether it's a compilation-unit local static
      // symbol ("Static"), or whether it's available for use from other
      // compilation units ("External"). Note that there are other values,
      // such as "WeakExternal", and "Label".
      //
      // SYMNAME is the symbol name.
      //
      // The first symbol in each section appears to specify the section type,
      // for example:
      //
      // 006 00000000 SECT3  notype    Static       | .rdata
      // B44 00000000 SECT4  notype    Static       | .rdata$r
      // AA2 00000000 SECT5  notype    Static       | .bss
      //
      // Note that an UNDEF data symbol with non-zero OFFSET is a "common
      // symbol", equivalent to the nm `C` type.

      // Map of read-only (.rdata, .xdata) and uninitialized (.bss) sections
      // to their types (R and B, respectively). If a section is not found in
      // this map, then it's assumed to be normal data (.data).
      //
      map<string, char> sections;

      string l;
      while (!eof (getline (is, l)))
      {
        size_t b (0), e (0), n;

        // IDX (note that it can be more than 3 characters).
        //
        if (next_word (l, b, e) == 0)
          continue;

        // OFFSET (always 8 characters).
        //
        n = next_word (l, b, e);

        if (n != 8)
          continue;

        string off (l, b, n);

        // SECT
        //
        n = next_word (l, b, e);

        if (n == 0)
          continue;

        string sec (l, b, n);

        // TYPE
        //
        n = next_word (l, b, e);

        if (l.compare (b, n, "notype") != 0)
          continue;

        bool dat;
        if (l[e] == ' ' && l[e + 1] == '(' && l[e + 2] == ')')
        {
          e += 3;
          dat = false;
        }
        else
          dat = true;

        // VISIBILITY
        //
        n = next_word (l, b, e);

        if (n == 0)
          continue;

        string vis (l, b, n);

        // |
        //
        n = next_word (l, b, e);

        if (n != 1 || l[b] != '|')
          continue;

        // SYMNAME
        //
        n = next_word (l, b, e);

        if (n == 0)
          continue;

        string s (l, b, n);

        // See if this is the section type symbol.
        //
        if (dat &&
            off == "00000000" &&
            sec != "UNDEF"    &&
            vis == "Static"   &&
            s[0] == '.')
        {
          auto cmp = [&s] (const char* n, size_t l)
          {
            return s.compare (0, l, n) == 0 && (s[l] == '\0' || s[l] == '$');
          };

          if      (cmp (".rdata", 6) ||
                   cmp (".xdata", 6))    sections.emplace (move (sec), 'R');
          else if (cmp (".bss",   4))    sections.emplace (move (sec), 'B');

          continue;
        }

        // We can only export extern symbols.
        //
        if (vis != "External")
          continue;

        if (dat)
        {
          if (sec != "UNDEF")
          {
            auto i (sections.find (sec));
            switch (i == sections.end () ? 'D' : i->second)
            {
            case 'D': syms.d.insert (move (s)); break;
            case 'R': syms.r.insert (move (s)); break;
            case 'B': syms.b.insert (move (s)); break;
            }
          }
          else
          {
            if (off != "00000000")
              syms.c.insert (move (s));
          }
        }
        else
        {
          if (sec != "UNDEF")
            syms.t.insert (move (s));
        }
      }
    }

    static void
    read_posix_nm (istream& is, symbols& syms)
    {
      // Lines that describe symbols look like:
      //
      // <NAME> <TYPE> <VALUE> <SIZE>
      //
      // The types that we are interested in are T, D, R, and B.
      //
      string l;
      while (!eof (getline (is, l)))
      {
        size_t b (0), e (0), n;

        // NAME
        //
        n = next_word (l, b, e);

        if (n == 0)
          continue;

        string s (l, b, n);

        // TYPE
        //
        n = next_word (l, b, e);

        if (n != 1)
          continue;

        switch (l[b])
        {
        case 'D': syms.d.insert (move (s)); break;
        case 'R': syms.r.insert (move (s)); break;
        case 'B': syms.b.insert (move (s)); break;
        case 'c':
        case 'C': syms.c.insert (move (s)); break;
        case 'T': syms.t.insert (move (s)); break;
        }
      }
    }

    static void
    write_win32_msvc (ostream& os, const symbols& syms, bool i386)
    {
      // Our goal here is to export the same types of symbols as what gets
      // exported by MSVC with __declspec(dllexport) (can be viewed with
      // dumpbin /EXPORTS).
      //
      // Some special C++ symbol patterns:
      //
      // Data symbols:
      //
      // ??_C* -- string literal                      (R,   not exported)
      // ??_7* -- vtable                              (R,   exported)
      // ??_R* -- rtti, can be prefixed with _CT/__CT (D/R, not exported)
      //
      // Text symbols:
      //
      // ??_G* -- scalar deleting destructor (not exported)
      // ??_E* -- vector deleting destructor (not exported)
      //
      // The following two symbols seem to be related to exception
      // throwing and most likely should not be exported.
      //
      // R _CTA3?AVinvalid_argument@std@@
      // R _TI3?AVinvalid_argument@std@@
      //
      // There are also what appears to be floating point literals:
      //
      // R __real@3f80000
      //
      // For some reason i386 object files have extern "C" symbols (both
      // data and text) prefixed with an underscore which must be stripped
      // in the .def file.
      //
      // Note that the extra prefix seems to be also added to special
      // symbols so something like _CT??... becomes __CT??... on i386.
      // However, for such symbols the underscore shall not be removed.
      // Which means an extern "C" _CT becomes __CT on i383 and hard to
      // distinguish from the special symbols. We deal with this by only
      // stripping the underscore if the symbols doesn't contain any
      // special characters (?@).
      //
      auto extern_c = [] (const string& s)
      {
        return s.find_first_of ("?@") == string::npos;
      };

      auto strip = [i386, &extern_c] (const string& s) -> const char*
      {
        const char* r (s.c_str ());

        if (i386 && s[0] == '_' && extern_c (s))
          r++;

        return r;
      };

      // Code.
      //
      for (const string& s: syms.t)
      {
        auto filter = [&strip] (const string& s) -> const char*
        {
          if (s.compare (0, 4, "??_G") == 0 ||
              s.compare (0, 4, "??_E") == 0)
            return nullptr;

          return strip (s);
        };

        if (const char* v = filter (s))
          os << "  " << v << '\n';
      }

      // Data.
      //
      // Note that it's not easy to import data without a dllimport
      // declaration.
      //
      {
        auto filter = [&strip] (const string& s) -> const char*
        {
          if (s.compare (0, 4, "??_R") == 0 ||
              s.compare (0, 4, "??_C") == 0)
            return nullptr;

          return strip (s);
        };

        for (const string& s: syms.d)
          if (const char* v = filter (s))
            os << "  " << v << " DATA\n";

        for (const string& s: syms.b)
          if (const char* v = filter (s))
            os << "  " << v << " DATA\n";

        // For common symbols, only write extern C.
        //
        for (const string& s: syms.c)
          if (extern_c (s))
            if (const char* v = filter (s))
              os << "  " << v << " DATA\n";

        // Read-only data contains an especially large number of various
        // special symbols. Instead of trying to filter them out case by case,
        // we will try to recognize C/C++ identifiers plus the special symbols
        // that we need to export (e.g., vtable).
        //
        //
        for (const string& s: syms.r)
        {
          if (extern_c (s)                 || // C
              (s[0] == '?' && s[1] != '?') || // C++
              s.compare (0, 4, "??_7") == 0)  // vtable
          {
            os << "  " << strip (s) << " DATA\n";
          }
        }
      }
    }

    static void
    write_mingw32 (ostream& os, const symbols& syms, bool i386)
    {
      // Our goal here is to export the same types of symbols as what gets
      // exported by GCC with __declspec(dllexport) (can be viewed with
      // dumpbin /EXPORTS).
      //
      // Some special C++ symbol patterns (Itanium C++ ABI):
      //
      // Data symbols:
      //
      // _ZTVN* -- vtable          (R,   exported)
      // _ZTIN* -- typeinfo        (R,   exported)
      // _ZTSN* -- typeinfo name   (R, not exported)
      //
      // There are also some special R symbols which start with .refptr.
      // that are not exported.
      //
      // Normal symbols (both text and data) appear to start with _ZN.
      //
      // Note that we have the same extra underscore for i386 as in the
      // win32-msvc case above but here even for mangled symbols (e.g., __Z*).
      //
      auto skip = [i386] (const string& s) -> size_t
      {
        return i386 && s[0] == '_' ? 1 : 0;
      };

      // Code.
      //
      for (const string& s: syms.t)
      {
        auto filter = [&skip] (const string& s) -> const char*
        {
          return s.c_str () + skip (s);
        };

        if (const char* v = filter (s))
          os << "  " << v << '\n';
      }

      // Data.
      //
      {
        auto filter = [&skip] (const string& s) -> const char*
        {
          return s.c_str () + skip (s);
        };

        for (const string& s: syms.d)
          if (const char* v = filter (s))
            os << "  " << v << " DATA\n";

        for (const string& s: syms.b)
          if (const char* v = filter (s))
            os << "  " << v << " DATA\n";

        for (const string& s: syms.c)
          if (const char* v = filter (s))
            os << "  " << v << " DATA\n";

        // Read-only data contains an especially large number of various
        // special symbols. Instead of trying to filter them out case by case,
        // we will try to recognize C/C++ identifiers plus the special symbols
        // that we need to export (e.g., vtable and typeinfo).
        //
        for (const string& s: syms.r)
        {
          if (s.find_first_of (".") != string::npos) // Special (.refptr.*)
            continue;

          size_t p (skip (s)), n (s.size () - p);

          if ((n < 2 || s[p] != '_' || s[p + 1] != 'Z') ||  // C
              (s[p + 2] == 'N'                        ) ||  // C++ (normal)
              (s[p + 2] == 'T' && (s[p + 3] == 'V' ||       // vtable
                                   s[p + 3] == 'I') &&      // typeinfo
               s[p + 4] == 'N'))
          {
            os << "  " << s.c_str () + p << " DATA\n";
          }
        }
      }
    }

    bool def_rule::
    match (action a, target& t) const
    {
      tracer trace ("bin::def_rule::match");

      // See if we have an object file or a utility library.
      //
      for (prerequisite_member p: reverse_group_prerequisite_members (a, t))
      {
        // If excluded or ad hoc, then don't factor it into our tests.
        //
        if (include (a, t, p) != include_type::normal)
          continue;

        if (p.is_a<obj> ()   || p.is_a<objs> () ||
            p.is_a<bmi> ()   || p.is_a<bmis> () ||
            p.is_a<libul> () || p.is_a<libus> ())
          return true;
      }

      l4 ([&]{trace << "no object or utility library prerequisite for target "
                    << t;});
      return false;
    }

    recipe def_rule::
    apply (action a, target& xt) const
    {
      def& t (xt.as<def> ());

      t.derive_path ();

      // Inject dependency on the output directory.
      //
      inject_fsdir (a, t);

      // Match prerequisites only picking object files and utility libraries.
      //
      match_prerequisite_members (
        a,
        t,
        [] (action a,
            const target& t,
            const prerequisite_member& p,
            include_type i) -> prerequisite_target
        {
          return
            i == include_type::adhoc ? nullptr :
            //
            // If this is a target group, then pick the appropriate member
            // (the same semantics as what we have in link-rule).
            //
            p.is_a<obj> ()   ? &search (t, objs::static_type, p.key ()) :
            p.is_a<bmi> ()   ? &search (t, bmis::static_type, p.key ()) :
            p.is_a<libul> () ? link_member (p.search (t).as<libul> (),
                                            a,
                                            linfo {otype::s, lorder::s}) :
            p.is_a<objs> () ||
            p.is_a<bmis> () ||
            p.is_a<libus> () ? &p.search (t) : nullptr;
        });

      switch (a)
      {
      case perform_update_id: return &perform_update;
      case perform_clean_id:  return &perform_clean_depdb; // Standard clean.
      default:                return noop_recipe;          // Configure update.
      }
    }

    target_state def_rule::
    perform_update (action a, const target& xt)
    {
      tracer trace ("bin::def_rule::perform_update");

      const def& t (xt.as<def> ());
      const path& tp (t.path ());

      context& ctx (t.ctx);

      const scope& bs (t.base_scope ());
      const scope& rs (*bs.root_scope ());

      // For link.exe we use its /DUMP option to access dumpbin.exe. Otherwise
      // (lld-link, MinGW), we use nm (llvm-nm, MinGW nm). For good measure
      // (e.g., the bin.def module is loaded without bin.ld), we also handle
      // the direct dumpbin.exe usage.
      //
      const string& lid (cast_empty<string> (rs["bin.ld.id"]));

      // Update prerequisites and determine if anything changed.
      //
      timestamp mt (t.load_mtime ());
      optional<target_state> ts (execute_prerequisites (a, t, mt));

      bool update (!ts);

      // We use depdb to track changes to the input set, etc.
      //
      depdb dd (tp + ".d");

      // First should come the rule name/version.
      //
      if (dd.expect (rule_id_) != nullptr)
        l4 ([&]{trace << "rule mismatch forcing update of " << t;});

      // Then the nm checksum.
      //
      if (dd.expect (lid == "msvc"
                     ? cast<string> (rs["bin.ld.checksum"])
                     : cast<string> (rs["bin.nm.checksum"])) != nullptr)
        l4 ([&]{trace << "linker mismatch forcing update of " << t;});

      // @@ TODO: track in depdb if making symbol filtering configurable.

      // Collect and hash the list of object files seeing through libus{}.
      //
      vector<reference_wrapper<const objs>> os;
      {
        sha256 cs;

        auto collect = [a, &rs, &os, &cs] (const file& t,
                                           const auto& collect) -> void
        {
          for (const target* pt: t.prerequisite_targets[a])
          {
            if (pt == nullptr)
              continue;

            const objs* o;
            if ((o = pt->is_a<objs> ()) != nullptr)
              ;
            else if (pt->is_a<hbmi> ())
              o = find_adhoc_member<objs> (*pt);
            //
            // Note that in prerequisite targets we will have the libux{}
            // members, not the group.
            //
            else if (const libus* l = pt->is_a<libus> ())
            {
              collect (*l, collect);
              continue;
            }
            else
              continue;

            hash_path (cs, o->path (), rs.out_path ());
            os.push_back (*o);
          }
        };

        collect (t, collect);

        if (dd.expect (cs.string ()) != nullptr)
          l4 ([&]{trace << "file set mismatch forcing update of " << t;});
      }

      // Update if any mismatch or depdb is newer that the output.
      //
      if (dd.writing () || dd.mtime > mt)
        update = true;

      dd.close ();

      // If nothing changed, then we are done.
      //
      if (!update)
        return *ts;

      const process_path& nm (lid == "msvc"
                              ? cast<process_path> (rs["bin.ld.path"])
                              : cast<process_path> (rs["bin.nm.path"]));

      cstrings args {nm.recall_string ()};

      string nid;
      if (lid == "msvc")
      {
        args.push_back ("/DUMP"); // Must come first.
        args.push_back ("/NOLOGO");
        args.push_back ("/SYMBOLS");
      }
      else
      {
        nid = cast<string> (rs["bin.nm.id"]);

        if (nid == "msvc")
        {
          args.push_back ("/NOLOGO");
          args.push_back ("/SYMBOLS");
        }
        else
        {
          // Note that llvm-nm's --no-weak is only available since LLVM 7.
          //
          args.push_back ("--extern-only");
          args.push_back ("--format=posix");
        }
      }

      args.push_back (nullptr); // Argument placeholder.
      args.push_back (nullptr);

      const char*& arg (*(args.end () - 2));

      if (verb == 1)
        text << "def " << t;

      // Extract symbols from each object file.
      //
      symbols syms;
      for (const objs& o: os)
      {
        // Use a relative path for nicer diagnostics.
        //
        path rp (relative (o.path ()));
        arg = rp.string ().c_str ();

        if (verb >= 2)
          print_process (args);

        if (ctx.dry_run)
          continue;

        // Both dumpbin.exe and nm send their output to stdout. While nm sends
        // diagnostics to stderr, dumpbin sends it to stdout together with the
        // output.
        //
        process pr (run_start (nm,
                               args,
                               0     /* stdin */,
                               -1    /* stdout */));
        bool io (false);
        try
        {
          ifdstream is (
            move (pr.in_ofd), fdstream_mode::skip, ifdstream::badbit);

          if (lid == "msvc" || nid == "msvc")
            read_dumpbin (is, syms);
          else
            read_posix_nm (is, syms);

          is.close ();
        }
        catch (const io_error&)
        {
          // Presumably the child process failed so let run_finish() deal with
          // that first.
          //
          io = true;
        }

        if (!run_finish_code (args.data (), pr) || io)
          fail << "unable to extract symbols from " << arg;
      }

#if 0
      for (const string& s: syms.d) text << "D " << s;
      for (const string& s: syms.r) text << "R " << s;
      for (const string& s: syms.b) text << "B " << s;
      for (const string& s: syms.c) text << "C " << s;
      for (const string& s: syms.t) text << "T " << s;
#endif

      if (verb >= 3)
        text << "cat >" << tp;

      if (!ctx.dry_run)
      {
        const auto& tgt (cast<target_triplet> (rs["bin.target"]));

        bool i386 (tgt.cpu.size () == 4 &&
                   tgt.cpu[0] == 'i' && tgt.cpu[2] == '8' && tgt.cpu[3] == '6');

        auto_rmfile rm (tp);
        try
        {
          ofdstream os (tp);

          os << "; Auto-generated, do not edit.\n"
             << "EXPORTS\n";

          if (tgt.system == "mingw32")
            write_mingw32 (os, syms, i386);
          else
            write_win32_msvc (os, syms, i386);

          os.close ();
          rm.cancel ();
        }
        catch (const io_error& e)
        {
          fail << "unable to write to " << tp << ": " << e;
        }

        dd.check_mtime (tp);
      }

      t.mtime (system_clock::now ());
      return target_state::changed;
    }

    const string def_rule::rule_id_ {"bin.def 2"};
  }
}