Skip to content

Commit d3013c8

Browse files
committed
CCBC-1646: Server groups for replica reads
Change-Id: Ia3aa07ce4bb8c0fc81aecd38b999fe16a0574fb8 Reviewed-on: https://review.couchbase.org/c/libcouchbase/+/216990 Tested-by: Build Bot <build@couchbase.com> Reviewed-by: Jared Casey <jared.casey@couchbase.com>
1 parent db7187e commit d3013c8

10 files changed

Lines changed: 361 additions & 55 deletions

File tree

include/libcouchbase/cntl.h

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1326,11 +1326,27 @@ typedef enum {
13261326
*/
13271327
#define LCB_CNTL_ENABLE_OP_METRICS 0x67
13281328

1329+
/**
1330+
* Select server group to use for replica APIs.
1331+
*
1332+
* For some use-cases it might be necessary to restrict list of the nodes,
1333+
* that are used in replica read APIs to single server group to optimize
1334+
* network costs.
1335+
*
1336+
* Use `preferred_server_group` in the connection string.
1337+
*
1338+
* @committed
1339+
* @cntl_arg_both{const char*}
1340+
*
1341+
* @see https://docs.couchbase.com/server/current/manage/manage-groups/manage-groups.html
1342+
*/
1343+
#define LCB_CNTL_PREFERRED_SERVER_GROUP 0x68
1344+
13291345
/**
13301346
* This is not a command, but rather an indicator of the last item.
13311347
* @internal
13321348
*/
1333-
#define LCB_CNTL__MAX 0x68
1349+
#define LCB_CNTL__MAX 0x69
13341350
/**@}*/
13351351

13361352
#ifdef __cplusplus

include/libcouchbase/couchbase.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -793,6 +793,11 @@ typedef enum {
793793
LCB_REPLICA_MODE__MAX
794794
} lcb_REPLICA_MODE;
795795

796+
typedef enum {
797+
LCB_REPLICA_READ_PREFERENCE_NONE = 0x00,
798+
LCB_REPLICA_READ_PREFERENCE_SELECTED_SERVER_GROUP = 0x01,
799+
} lcb_REPLICA_READ_PREFERENCE;
800+
796801
typedef struct lcb_RESPGETREPLICA_ lcb_RESPGETREPLICA;
797802

798803
LIBCOUCHBASE_API lcb_STATUS lcb_respgetreplica_status(const lcb_RESPGETREPLICA *resp);
@@ -817,6 +822,8 @@ LIBCOUCHBASE_API lcb_STATUS lcb_cmdgetreplica_collection(lcb_CMDGETREPLICA *cmd,
817822
const char *collection, size_t collection_len);
818823
LIBCOUCHBASE_API lcb_STATUS lcb_cmdgetreplica_key(lcb_CMDGETREPLICA *cmd, const char *key, size_t key_len);
819824
LIBCOUCHBASE_API lcb_STATUS lcb_cmdgetreplica_timeout(lcb_CMDGETREPLICA *cmd, uint32_t timeout);
825+
LIBCOUCHBASE_API lcb_STATUS lcb_cmdgetreplica_read_preference(lcb_CMDGETREPLICA *cmd,
826+
lcb_REPLICA_READ_PREFERENCE preference);
820827
/**
821828
* @internal Internal: This should never be used and is not supported.
822829
*/

include/libcouchbase/vbucket.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ typedef struct {
110110
char *alt_hostname; /**< selected alternative hostname for the node */
111111
lcbvb_SERVICES alt_svc; /**< selected alternative plain services */
112112
lcbvb_SERVICES alt_svc_ssl; /**< selected alternative SSL Services */
113+
char *server_group; /**< the group name or NULL */
113114
} lcbvb_SERVER;
114115

115116
/**@volatile. ABI/API compatibility not guaranteed between versions */

src/capi/cmd_get_replica.hh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,17 @@ struct lcb_CMDGETREPLICA_ {
7575
return LCB_SUCCESS;
7676
}
7777

78+
lcb_STATUS read_preference(lcb_REPLICA_READ_PREFERENCE preference)
79+
{
80+
read_preference_ = preference;
81+
return LCB_SUCCESS;
82+
}
83+
84+
lcb_REPLICA_READ_PREFERENCE read_preference() const
85+
{
86+
return read_preference_;
87+
}
88+
7889
lcb_STATUS collection(lcb::collection_qualifier collection)
7990
{
8091
collection_ = std::move(collection);
@@ -194,6 +205,7 @@ struct lcb_CMDGETREPLICA_ {
194205
int select_index_{0};
195206
std::string impostor_{};
196207
std::vector<std::string> extra_privileges_{};
208+
lcb_REPLICA_READ_PREFERENCE read_preference_{LCB_REPLICA_READ_PREFERENCE_NONE};
197209
};
198210

199211
struct lcb_RESPGETREPLICA_ {

src/cntl.cc

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -747,6 +747,22 @@ HANDLER(network_handler)
747747
return LCB_SUCCESS;
748748
}
749749

750+
HANDLER(preferred_server_group_handler)
751+
{
752+
if (mode == LCB_CNTL_SET) {
753+
const char *val = reinterpret_cast<const char *>(arg);
754+
free(LCBT_SETTING(instance, preferred_server_group));
755+
LCBT_SETTING(instance, network) = nullptr;
756+
if (val) {
757+
LCBT_SETTING(instance, preferred_server_group) = lcb_strdup(val);
758+
}
759+
} else {
760+
*(const char **)arg = LCBT_SETTING(instance, preferred_server_group);
761+
}
762+
(void)cmd;
763+
return LCB_SUCCESS;
764+
}
765+
750766
HANDLER(durable_write_handler){RETURN_GET_SET(int, LCBT_SETTING(instance, enable_durable_write))}
751767

752768
HANDLER(unordered_execution_handler)
@@ -860,6 +876,7 @@ static ctl_handler handlers[] = {
860876
enable_errmap_handler, /* LCB_CNTL_ENABLE_ERRMAP */
861877
timeout_common, /* LCB_CNTL_OP_METRICS_FLUSH_INTERVAL */
862878
enable_op_metrics_handler, /* LCB_CNTL_ENABLE_OP_METRICS */
879+
preferred_server_group_handler, /* LCB_CNTL_PREFERRED_SERVER_GROUP */
863880
nullptr
864881
};
865882
/* clang-format on */
@@ -1101,6 +1118,7 @@ static cntl_OPCODESTRS stropcode_map[] = {
11011118
{"enable_errmap", LCB_CNTL_ENABLE_ERRMAP, convert_intbool},
11021119
{"operation_metrics_flush_interval", LCB_CNTL_OP_METRICS_FLUSH_INTERVAL, convert_timevalue},
11031120
{"enable_operation_metrics", LCB_CNTL_ENABLE_OP_METRICS, convert_intbool},
1121+
{"preferred_server_group", LCB_CNTL_PREFERRED_SERVER_GROUP, convert_passthru},
11041122
{nullptr, -1}};
11051123

11061124
#define CNTL_NUM_HANDLERS (sizeof(handlers) / sizeof(handlers[0]))

src/operations/get_replica.cc

Lines changed: 156 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@
2424
#include "capi/cmd_get.hh"
2525
#include "capi/cmd_get_replica.hh"
2626

27+
#include <random>
28+
#include <algorithm>
29+
2730
LIBCOUCHBASE_API lcb_STATUS lcb_respgetreplica_status(const lcb_RESPGETREPLICA *resp)
2831
{
2932
return resp->ctx.rc;
@@ -139,6 +142,12 @@ LIBCOUCHBASE_API lcb_STATUS lcb_cmdgetreplica_key(lcb_CMDGETREPLICA *cmd, const
139142
return cmd->key(std::string(key, key_len));
140143
}
141144

145+
LIBCOUCHBASE_API lcb_STATUS lcb_cmdgetreplica_read_preference(lcb_CMDGETREPLICA *cmd,
146+
lcb_REPLICA_READ_PREFERENCE preference)
147+
{
148+
return cmd->read_preference(preference);
149+
}
150+
142151
LIBCOUCHBASE_API lcb_STATUS lcb_cmdgetreplica_on_behalf_of(lcb_CMDGETREPLICA *cmd, const char *data, size_t data_len)
143152
{
144153
return cmd->on_behalf_of(std::string(data, data_len));
@@ -243,6 +252,90 @@ RGetCookie::RGetCookie(void *cookie_, lcb_INSTANCE *instance_, get_replica_mode
243252
{
244253
}
245254

255+
struct readable_node {
256+
bool is_replica;
257+
std::size_t node_index; // index of the node in configuration
258+
std::size_t replica_index; // zero-based index of the replica
259+
};
260+
261+
template <typename COMMAND>
262+
static std::vector<readable_node> select_effective_node_indexes(lcb_INSTANCE *instance, COMMAND cmd)
263+
{
264+
mc_CMDQUEUE *cq = &instance->cmdq;
265+
int vbid;
266+
int active_index;
267+
lcb_KEYBUF keybuf{LCB_KV_COPY, {cmd->key().c_str(), cmd->key().size()}};
268+
mcreq_map_key(cq, &keybuf, MCREQ_PKT_BASESIZE, &vbid, &active_index);
269+
270+
if (active_index < 0) {
271+
return {};
272+
}
273+
274+
std::vector<readable_node> effective_nodes{};
275+
276+
if (cmd->mode() == get_replica_mode::select) {
277+
int replica_index = cmd->selected_replica_index();
278+
int node_index = lcbvb_vbreplica(cq->config, vbid, replica_index);
279+
if (node_index >= 0) {
280+
effective_nodes.push_back(
281+
{true, static_cast<std::size_t>(node_index), static_cast<std::size_t>(replica_index)});
282+
}
283+
return effective_nodes;
284+
}
285+
286+
std::string preferred_server_group{};
287+
bool use_preferred_server_group{cmd->read_preference() == LCB_REPLICA_READ_PREFERENCE_SELECTED_SERVER_GROUP};
288+
289+
if (use_preferred_server_group) {
290+
if (instance->settings->preferred_server_group == nullptr) {
291+
return {};
292+
} else {
293+
preferred_server_group = instance->settings->preferred_server_group;
294+
}
295+
}
296+
297+
/* Make sure they're all online */
298+
for (std::size_t ii = 0; ii < LCBT_NREPLICAS(instance); ii++) {
299+
int node_index = lcbvb_vbreplica(cq->config, vbid, ii);
300+
if (node_index < 0) {
301+
if (cmd->mode() == get_replica_mode::all) {
302+
return {};
303+
}
304+
continue;
305+
}
306+
if (use_preferred_server_group) {
307+
const char *server_group = cq->config->servers[node_index].server_group;
308+
if (server_group != nullptr && preferred_server_group == server_group) {
309+
effective_nodes.push_back({true, static_cast<std::size_t>(node_index), ii});
310+
}
311+
} else {
312+
effective_nodes.push_back({true, static_cast<std::size_t>(node_index), ii});
313+
}
314+
}
315+
316+
if (cmd->mode() == get_replica_mode::any) {
317+
if (effective_nodes.empty()) {
318+
return {};
319+
}
320+
static std::random_device rd;
321+
std::mt19937 g(rd());
322+
std::shuffle(effective_nodes.begin(), effective_nodes.end(), g);
323+
return {effective_nodes.front()};
324+
}
325+
326+
/* read from active */
327+
if (use_preferred_server_group) {
328+
const char *active_server_group = cq->config->servers[active_index].server_group;
329+
if (active_server_group != nullptr && preferred_server_group == active_server_group) {
330+
effective_nodes.push_back({false, static_cast<std::size_t>(active_index), 0xff});
331+
}
332+
} else {
333+
effective_nodes.push_back({false, static_cast<std::size_t>(active_index), 0xff});
334+
}
335+
336+
return effective_nodes;
337+
}
338+
246339
static lcb_STATUS get_replica_validate(lcb_INSTANCE *instance, const lcb_CMDGETREPLICA *cmd)
247340
{
248341
if (cmd->key().empty()) {
@@ -283,6 +376,13 @@ static lcb_STATUS get_replica_validate(lcb_INSTANCE *instance, const lcb_CMDGETR
283376
break;
284377

285378
case get_replica_mode::all:
379+
if (cmd->read_preference() == LCB_REPLICA_READ_PREFERENCE_SELECTED_SERVER_GROUP) {
380+
auto effective_nodes = select_effective_node_indexes(instance, cmd);
381+
if (effective_nodes.empty()) {
382+
return LCB_ERR_DOCUMENT_UNRETRIEVABLE;
383+
}
384+
}
385+
286386
r0 = 0;
287387
r1 = LCBT_NREPLICAS(instance);
288388
/* Make sure they're all online */
@@ -309,7 +409,6 @@ static lcb_STATUS get_replica_schedule(lcb_INSTANCE *instance, std::shared_ptr<l
309409
*/
310410
mc_CMDQUEUE *cq = &instance->cmdq;
311411
int vbid, ixtmp;
312-
protocol_binary_request_header req{};
313412
unsigned r0 = 0, r1 = 0;
314413

315414
lcb_KEYBUF keybuf{LCB_KV_COPY, {cmd->key().c_str(), cmd->key().size()}};
@@ -355,6 +454,11 @@ static lcb_STATUS get_replica_schedule(lcb_INSTANCE *instance, std::shared_ptr<l
355454
return LCB_ERR_NO_MATCHING_SERVER;
356455
}
357456

457+
const auto effective_nodes = select_effective_node_indexes(instance, cmd);
458+
if (effective_nodes.empty()) {
459+
return LCB_ERR_DOCUMENT_UNRETRIEVABLE;
460+
}
461+
358462
std::vector<std::uint8_t> framing_extras;
359463
if (cmd->want_impersonation()) {
360464
lcb_STATUS err = lcb::flexible_framing_extras::encode_impersonate_user(cmd->impostor(), framing_extras);
@@ -376,68 +480,67 @@ static lcb_STATUS get_replica_schedule(lcb_INSTANCE *instance, std::shared_ptr<l
376480
rck->start + cmd->timeout_or_default_in_nanoseconds(LCB_US2NS(LCBT_SETTING(instance, operation_timeout)));
377481

378482
/* Initialize the packet */
483+
protocol_binary_request_header req{};
379484
req.request.magic = framing_extras.empty() ? PROTOCOL_BINARY_REQ : PROTOCOL_BINARY_AREQ;
380-
req.request.opcode = PROTOCOL_BINARY_CMD_GET_REPLICA;
381485
req.request.datatype = PROTOCOL_BINARY_RAW_BYTES;
382486
req.request.vbucket = htons(static_cast<std::uint16_t>(vbid));
383487
req.request.cas = 0;
384488
req.request.extlen = 0;
385489

386490
auto ffextlen = static_cast<std::uint8_t>(framing_extras.size());
387491

388-
rck->r_cur = r0;
389-
do {
390-
int curix;
391-
mc_PIPELINE *pl;
392-
mc_PACKET *pkt;
393-
394-
curix = lcbvb_vbreplica(cq->config, vbid, r0);
395-
/* XXX: this is always expected to be in range. For the FIRST mode
396-
* it will seek to the first valid index (checked above), and for the
397-
* ALL mode, it will fail if not all replicas are already online
398-
* (also checked above) */
399-
pl = cq->pipelines[curix];
400-
pkt = mcreq_allocate_packet(pl);
401-
if (!pkt) {
402-
delete rck;
403-
return LCB_ERR_NO_MEMORY;
404-
}
492+
for (auto node : effective_nodes) {
493+
if (node.is_replica) {
494+
req.request.opcode = PROTOCOL_BINARY_CMD_GET_REPLICA;
495+
rck->r_cur = node.replica_index;
496+
mc_PIPELINE *pl;
497+
mc_PACKET *pkt;
498+
499+
/* XXX: this is always expected to be in range. For the FIRST mode
500+
* it will seek to the first valid index (checked above), and for the
501+
* ALL mode, it will fail if not all replicas are already online
502+
* (also checked above) */
503+
pl = cq->pipelines[node.node_index];
504+
pkt = mcreq_allocate_packet(pl);
505+
if (!pkt) {
506+
delete rck;
507+
return LCB_ERR_NO_MEMORY;
508+
}
405509

406-
pkt->u_rdata.exdata = rck;
407-
pkt->flags |= MCREQ_F_REQEXT;
408-
409-
mcreq_reserve_key(pl, pkt, sizeof(req.bytes) + ffextlen, &keybuf, cmd->collection().collection_id());
410-
size_t nkey = pkt->kh_span.size - MCREQ_PKT_BASESIZE + pkt->extlen;
411-
req.request.keylen = htons((uint16_t)nkey);
412-
req.request.bodylen = htonl((uint32_t)nkey + framing_extras.size());
413-
req.request.opaque = pkt->opaque;
414-
rck->remaining++;
415-
mcreq_write_hdr(pkt, &req);
416-
if (!framing_extras.empty()) {
417-
memcpy(SPAN_BUFFER(&pkt->kh_span) + sizeof(req.bytes), framing_extras.data(), framing_extras.size());
418-
}
419-
mcreq_sched_add(pl, pkt);
420-
} while (++r0 < r1);
421-
422-
if (cmd->need_get_active()) {
423-
req.request.opcode = PROTOCOL_BINARY_CMD_GET;
424-
mc_PIPELINE *pl;
425-
mc_PACKET *pkt;
426-
lcb_STATUS err = mcreq_basic_packet(cq, &keybuf, cmd->collection().collection_id(), &req, 0, ffextlen, &pkt,
427-
&pl, MCREQ_BASICPACKET_F_FALLBACKOK);
428-
if (err != LCB_SUCCESS) {
429-
delete rck;
430-
return err;
431-
}
432-
req.request.opaque = pkt->opaque;
433-
pkt->u_rdata.exdata = rck;
434-
pkt->flags |= MCREQ_F_REQEXT;
435-
rck->remaining++;
436-
mcreq_write_hdr(pkt, &req);
437-
if (!framing_extras.empty()) {
438-
memcpy(SPAN_BUFFER(&pkt->kh_span) + sizeof(req.bytes), framing_extras.data(), framing_extras.size());
510+
pkt->u_rdata.exdata = rck;
511+
pkt->flags |= MCREQ_F_REQEXT;
512+
513+
mcreq_reserve_key(pl, pkt, sizeof(req.bytes) + ffextlen, &keybuf, cmd->collection().collection_id());
514+
size_t nkey = pkt->kh_span.size - MCREQ_PKT_BASESIZE + pkt->extlen;
515+
req.request.keylen = htons((uint16_t)nkey);
516+
req.request.bodylen = htonl((uint32_t)nkey + framing_extras.size());
517+
req.request.opaque = pkt->opaque;
518+
rck->remaining++;
519+
mcreq_write_hdr(pkt, &req);
520+
if (!framing_extras.empty()) {
521+
memcpy(SPAN_BUFFER(&pkt->kh_span) + sizeof(req.bytes), framing_extras.data(), framing_extras.size());
522+
}
523+
mcreq_sched_add(pl, pkt);
524+
} else {
525+
req.request.opcode = PROTOCOL_BINARY_CMD_GET;
526+
mc_PIPELINE *pl;
527+
mc_PACKET *pkt;
528+
lcb_STATUS err = mcreq_basic_packet(cq, &keybuf, cmd->collection().collection_id(), &req, 0, ffextlen, &pkt,
529+
&pl, MCREQ_BASICPACKET_F_FALLBACKOK);
530+
if (err != LCB_SUCCESS) {
531+
delete rck;
532+
return err;
533+
}
534+
req.request.opaque = pkt->opaque;
535+
pkt->u_rdata.exdata = rck;
536+
pkt->flags |= MCREQ_F_REQEXT;
537+
rck->remaining++;
538+
mcreq_write_hdr(pkt, &req);
539+
if (!framing_extras.empty()) {
540+
memcpy(SPAN_BUFFER(&pkt->kh_span) + sizeof(req.bytes), framing_extras.data(), framing_extras.size());
541+
}
542+
mcreq_sched_add(pl, pkt);
439543
}
440-
mcreq_sched_add(pl, pkt);
441544
}
442545

443546
MAYBE_SCHEDLEAVE(instance)

0 commit comments

Comments
 (0)