From 73961c9624531b6f6326dac4ab0d5c462a2cff94 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Fri, 12 May 2023 07:33:40 +0200 Subject: Add infrastructure for controller URL priorities This is the ground work for the task priority/interrupt support. --- bbot/agent/agent.cli | 58 ++++++++++++- bbot/agent/agent.cxx | 235 ++++++++++++++++++++++++++++++++++----------------- 2 files changed, 212 insertions(+), 81 deletions(-) (limited to 'bbot') diff --git a/bbot/agent/agent.cli b/bbot/agent/agent.cli index 3d028fd..1046685 100644 --- a/bbot/agent/agent.cli +++ b/bbot/agent/agent.cli @@ -12,19 +12,73 @@ include ; namespace bbot { { - " ", + " ", " \h|SYNOPSIS| \c{\b{bbot-agent --help}\n \b{bbot-agent --version}\n - \b{bbot-agent} [] ...} + \b{bbot-agent} [] [=]...} \h|DESCRIPTION| \cb{bbot-agent} @@ TODO. + The controller URL is a four or five-digit decimal value. If it + is absent, then \cb{0} (lowest priority) is assumed. URLs with equal + priority are queried at random. + + The value has the \c{[\i{F}]\i{DCBA}} form which encodes four + priority levels (\ci{DCBA}) each occupying one decimal digit (so there are + 10 distinct priorities in each level) plus the optional boost flag + (\ci{F}). These levels offer different trade-offs between the speed of + completing a higher priority task and potentially discarding work that has + already been done. + + The first priority level (\ci{A}) is a simple preference: among the URLs + with equal values for other levels (\ci{DCB}), those with higher first + level priorities are queried first. + + The second priority level (\ci{B}) has the semantics of the first level + plus it prevents URLs with lower second priority level from being + queried until the task with a higher second priority level has completed, + effectively conserving the resources for the higher priority task. + + The third priority level (\ci{C}) has the semantics of the second level + plus it may interrupt one lower third priority level task in order to + perform the higher third priority task (the interrupt is necessary if the + desired machine is used by the lower priority task or the number of tasks + already being performed is the maximum allowed to be performed + concurrently; see \cb{--instance-max}). + + Finally, the fourth priority level (\ci{D}) has the semantics of the third + level except that not one but all the lower fourth priority level tasks + are interrupting, effectively dedicating all the available resources to + the higher priority task. This level can also be combined with the boost + flag \ci{F}. If this flag is \cb{1} then the higher priority task's CPU + number (\cb{--cpu}) is boosted to the full number of available hardware + threads (or, to view it another way, the fourth priority level has 20 + possible values, not 10, with the first 0-9 being without the boost while + the last 10-19 being with the boost). + + Note that the priority levels are hierarchical in a sense that within a + given higher level URLs can be further prioritized using the lower + levels. As an example, consider a deployment with three controller URLs: + background package rebuilds (\cb{pkg.example.org}), user-initiated CI + (\cb{ci.example.org}), and user-initiated interactive CI + (\cb{ici.example.org}). Given the following priorities: + + \ + 0000=https://pkg.example.org + 0100=https://ci.example.org + 0101=https://ici.example.org + \ + + Both types of CI tasks will interrupt one background rebuild task if + necessary while the interactive CI tasks will be merely preferred over + non-interactive. + Note that on termination \cb{bbot-agent} may leave behind a machine lock and working machine snapshot. It is expected that the caller (normally Build OS monitor) cleans them up before restarting the agent. diff --git a/bbot/agent/agent.cxx b/bbot/agent/agent.cxx index a8b7c77..850f2b1 100644 --- a/bbot/agent/agent.cxx +++ b/bbot/agent/agent.cxx @@ -20,8 +20,10 @@ #include #include +#include #include #include +#include // setw() #include #include // generic_category() @@ -1635,20 +1637,57 @@ try offset = (tc_num - 1) * 100 + inst; - // Controller URLs. + // Controller priority to URLs map. // - if (argc < 2 && - !ops.dump_machines () && - !ops.fake_request_specified ()) + std::map controllers; + + for (int i (1); i != argc; ++i) { - fail << "controller url expected" << - info << "run " << argv[0] << " --help for details"; - } + // [=] + // + string a (argv[i]); + + // See if we have priority, falling back to priority 0 if absent. + // + uint64_t prio (0); - strings controllers; + // Note that we can also have `=` in (e.g., parameters) so we will + // only consider `=` as ours if prior to it we only have digits. + // + size_t p (a.find ('=')); + if (p != string::npos && a.find_first_not_of ("0123456789") == p) + { + // Require exactly four or five digits in case we later need to extend + // the priority levels beyond the 10 possible values (e.g., DDCCBBAA). + // + if (p != 4 && p != 5) + fail << "four or five-digit controller url priority expected in '" + << a << "'"; - for (int i (1); i != argc; ++i) - controllers.push_back (argv[i]); + char* e; + errno = 0; + prio = strtoull (a.c_str (), &e, 10); + assert (errno != ERANGE && e == a.c_str () + p); + + if (prio > 19999) + fail << "out of bounds controller url priority in '" << a << "'"; + + a.erase (0, p + 1); + } + + controllers[prio].push_back (move (a)); + } + + if (controllers.empty ()) + { + if (ops.dump_machines () || ops.fake_request_specified ()) + { + controllers[0].push_back ("https://example.org"); + } + else + fail << "controller url expected" << + info << "run " << argv[0] << " --help for details"; + } // Handle SIGHUP and SIGTERM. // @@ -1710,8 +1749,16 @@ try if (inst_max != 0) dr << info << "instance max " << inst_max; - for (const string& u: controllers) - dr << info << "controller url " << u; + // Note: keep last since don't restore fill/setw. + // + for (const pair& p: controllers) + { + for (const string& u: p.second) + { + dr.os.fill ('0'); + dr << info << "controller url " << std::setw (4) << p.first << '=' << u; + } + } } // The work loop. The steps we go through are: @@ -1815,79 +1862,94 @@ try continue; } - // Prepare task request. + // If we get a task, these contain all the corresponding information. // - task_request_manifest tq { - hname, - tc_name, - tc_ver, - imode, - ilogin, - fingerprint, - machine_header_manifests {} - }; - - // Note: do not assume tq.machines.size () == ms.size (). + task_request_manifest tq; + task_response_manifest tr; + uint64_t prio; + string url; + + // Iterate over controller priorities in reverse, that is, from highest to + // lowest. // - for (const bootstrapped_machine& m: ms) - { - // @@ For now skip machines locked by other processes. - // - // @@ Note: skip machines being bootstrapped. - // - if (m.lock.locked ()) - tq.machines.emplace_back (m.manifest.machine.id, - m.manifest.machine.name, - m.manifest.machine.summary); - } + // @@ Note: doing it in terms of direct iterators in anticipation for + // lower_bound(). + // + auto cb (controllers.begin ()); + auto ce (controllers.end ()); - if (ops.dump_machines ()) + for (; cb != ce; ) { - for (const machine_header_manifest& m: tq.machines) - serialize_manifest (m, cout, "stdout", "machine"); + const pair& pu (*--ce); - return 0; - } + prio = pu.first; + const strings& urls (pu.second); - if (tq.machines.empty ()) - { - // Normally this means all the machines are locked so sleep a bit less. + // Prepare task request (it will be the same within a given priority). // - sleep = rand_sleep () / 2; - continue; - } + tq = task_request_manifest { + hname, + tc_name, + tc_ver, + imode, + ilogin, + fingerprint, + machine_header_manifests {}}; + + // Note: do not assume tq.machines.size () == ms.size (). + // + for (const bootstrapped_machine& m: ms) + { + // @@ For now skip machines locked by other processes. + // + // @@ Note: skip machines being bootstrapped. + // + if (m.lock.locked ()) + tq.machines.emplace_back (m.manifest.machine.id, + m.manifest.machine.name, + m.manifest.machine.summary); + } - // Send task requests. - // - // Note that we have to do it while holding the lock on all the machines - // since we don't know which machine we will need. - // - string url; - task_response_manifest tr; + if (ops.dump_machines ()) + { + for (const machine_header_manifest& m: tq.machines) + serialize_manifest (m, cout, "stdout", "machine"); - if (ops.fake_request_specified ()) - { - auto t (parse_manifest (ops.fake_request (), "task")); + return 0; + } - tr = task_response_manifest { - "fake-session", // Dummy session. - nullopt, // No challenge. - url, // Empty result URL. - agent_checksum, - move (t)}; + if (tq.machines.empty ()) + { + // If we have no machines for this priority then we won't have any + // for any lower priority so bail out. + // + break; + } - url = "http://example.org"; - } - else - { - // Note that after completing each task we always start from the - // beginning of the list. This fact can be used to implement a poor - // man's priority system where we will continue serving the first listed - // controller for as long as it has tasks (and maybe in the future we - // will implement a proper priority system). + // Send task requests. + // + // Note that we have to do it while holding the lock on all the machines + // since we don't know which machine we will need. + // + // @@ TODO: need to iterate in random order somehow. // - for (const string& u: controllers) + for (const string& u: urls) { + if (ops.fake_request_specified ()) + { + auto t (parse_manifest (ops.fake_request (), "task")); + + tr = task_response_manifest { + "fake-session", // Dummy session. + nullopt, // No challenge. + string (), // Empty result URL. + agent_checksum, + move (t)}; + + url = u; + break; + } + task_response_manifest r; try @@ -1903,8 +1965,9 @@ try "--max-time", ops.request_timeout (), "--connect-timeout", ops.connect_timeout ()); - // This is tricky/hairy: we may fail hard parsing the output before - // seeing that curl exited with an error and failing softly. + // This is tricky/hairy: we may fail hard parsing the output + // before seeing that curl exited with an error and failing + // softly. // bool f (false); @@ -1949,8 +2012,8 @@ try { const task_manifest& t (*r.task); - // For security reasons let's require the repository location to be - // remote. + // For security reasons let's require the repository location to + // be remote. // if (t.repository.local ()) { @@ -1973,13 +2036,27 @@ try l2 ([&]{trace << "task for " << t.name << '/' << t.version << " " << "on " << t.machine << " " - << "from " << u;}); + << "from " << u << " " + << "priority " << prio;}); tr = move (r); url = u; break; } - } + } // url loop. + + if (!tr.session.empty ()) // Got a task. + break; + + } // prio loop. + + if (tq.machines.empty ()) // No machines. + { + // Normally this means all the machines are busy so sleep a bit less. + // + l2 ([&]{trace << "all machines are busy, sleeping";}); + sleep = rand_sleep () / 2; + continue; } if (tr.session.empty ()) // No task from any of the controllers. @@ -2006,7 +2083,7 @@ try { if (mh.name == m.manifest.machine.name) { - m.lock.write (tl, 1234 /* prio */); + m.lock.write (tl, prio); pm = &m; } else -- cgit v1.1