/* Source: mirror of https://github.com/valkey-io/valkey (C, 6515 lines, 253 KiB) */
/*
 * Copyright (c) 2009-2012, Redis Ltd.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Redis nor the names of its contributors may be used
 *     to endorse or promote products derived from this software without
 *     specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
|
|
|
|
#include "server.h"
|
|
#include "cluster.h"
|
|
#include "cluster_slot_stats.h"
|
|
#include "cluster_migrateslots.h"
|
|
#include "script.h"
|
|
#include "intset.h"
|
|
#include "sds.h"
|
|
#include "fpconv_dtoa.h"
|
|
#include "fmtargs.h"
|
|
#include "io_threads.h"
|
|
#include "module.h"
|
|
#include "connection.h"
|
|
#include "zmalloc.h"
|
|
#include <strings.h>
|
|
#include <sys/socket.h>
|
|
#include <sys/uio.h>
|
|
#include <math.h>
|
|
#include <ctype.h>
|
|
#include <stdatomic.h>
|
|
#include <stdbool.h>
|
|
|
|
/* This struct is used to encapsulate filtering criteria for operations on clients
|
|
* such as identifying specific clients to kill or retrieve. Each field in the struct
|
|
* represents a filter that can be applied based on specific attributes of a client. */
|
|
typedef struct {
|
|
/* A set of client IDs to filter. If NULL, no ID filtering is applied. */
|
|
intset *ids;
|
|
intset *not_ids;
|
|
/* Maximum age (in seconds) of a client connection for filtering.
|
|
* Connections younger than this value will not match.
|
|
* A value of 0 means no age filtering. */
|
|
long long max_age;
|
|
/* Address/port of the client. If NULL, no address filtering is applied. */
|
|
char *addr;
|
|
char *not_addr;
|
|
/* Remote address/port of the client. If NULL, no address filtering is applied. */
|
|
char *laddr;
|
|
char *not_laddr;
|
|
/* Filtering clients by authentication user. If NULL, no user-based filtering is applied. */
|
|
user *user;
|
|
user *not_user;
|
|
/* Client type to filter. If set to -1, no type filtering is applied. */
|
|
int type;
|
|
int not_type;
|
|
/* Boolean flag to determine if the current client (`me`) should be filtered. 1 means "skip me", 0 means otherwise. */
|
|
int skipme;
|
|
/* Client name to filter. If NULL, no name filtering is applied. */
|
|
char *name;
|
|
char *not_name;
|
|
/* Idle time (in seconds) of a client connection for filtering.
|
|
* Connections with idle time more than this value will match.
|
|
* A value of 0 means no idle time filtering. */
|
|
long long idle;
|
|
/* Client flags for filtering. If NULL, no filtering is applied. */
|
|
sds flags;
|
|
sds not_flags;
|
|
/* Library name to filter. If NULL, no library name filtering is applied. */
|
|
robj *lib_name;
|
|
robj *not_lib_name;
|
|
/* Library version to filter. If NULL, no library version filtering is applied. */
|
|
robj *lib_ver;
|
|
robj *not_lib_ver;
|
|
/* Database index to filter. If set to -1, no DB number filtering is applied. */
|
|
int db_number;
|
|
int not_db_number;
|
|
/* Client capa for filtering. If NULL, no filtering is applied. */
|
|
sds capa;
|
|
sds not_capa;
|
|
/* Client ip for filtering. If NULL, no filtering is applied. */
|
|
sds ip;
|
|
sds not_ip;
|
|
} clientFilter;
|
|
|
|
/* Kinds of payload stored in the reply buffers (c->buf and c->reply).
 * An unencoded buffer holds plain replies only. An encoded buffer holds
 * headers, each followed by either plain reply bytes or bulk string
 * references. */
typedef enum {
    PLAIN_REPLY = 0, /* plain reply bytes */
    BULK_STR_REF     /* bulk string references */
} payloadType;
|
|
|
|
/* An encoded reply buffer is a sequence of chunks, each made of this header
 * followed by its payload. The struct is packed because headers are read and
 * written at arbitrary buffer offsets: padding would bring no benefit and
 * packing saves buffer space. */
typedef struct __attribute__((__packed__)) payloadHeader {
    size_t payload_len;   /* length of the payload inside the reply buffer */
    size_t reply_len;     /* actual reply length, for non-plain payloads */
    uint8_t payload_type; /* one of payloadType */
    int16_t slot;         /* used to report network-bytes-out for BULK_STR_REF chunks */
} payloadHeader;
|
|
|
|
/* To avoid copy of whole string in reply buffer
|
|
* we store pointers to object and string itself */
|
|
typedef struct __attribute__((__packed__)) bulkStrRef {
|
|
robj *obj; /* pointer to object used for reference count management */
|
|
sds str; /* pointer to string to optimize memory access by I/O thread */
|
|
} bulkStrRef;
|
|
|
|
static void setProtocolError(const char *errstr, client *c);
|
|
static void pauseClientsByClient(mstime_t end, int isPauseClientAll);
|
|
int postponeClientRead(client *c);
|
|
char *getClientSockname(client *c);
|
|
static int parseClientFiltersOrReply(client *c, int index, clientFilter *filter);
|
|
static int clientMatchesFilter(client *client, clientFilter *client_filter);
|
|
static int validateClientFlagFilter(sds flag_filter);
|
|
static int validateClientCapaFilter(sds capa);
|
|
static sds getAllFilteredClientsInfoString(clientFilter *client_filter, int hide_user_data);
|
|
static int clientMatchesFlagFilter(client *c, sds flag_filter);
|
|
static int clientMatchesIpFilter(client *c, sds ip);
|
|
static int clientMatchesCapaFilter(client *c, sds capa_filter);
|
|
static void freeClientFilter(clientFilter *filter);
|
|
static bool consumeCommandQueue(client *c);
|
|
static int parseMultibulk(client *c,
|
|
int *argc,
|
|
robj ***argv,
|
|
int *argv_len,
|
|
size_t *argv_len_sum,
|
|
unsigned long long *net_input_bytes_curr_cmd);
|
|
|
|
int ProcessingEventsWhileBlocked = 0; /* See processEventsWhileBlocked(). */
|
|
_Thread_local sds thread_shared_qb = NULL;
|
|
|
|
/* Result codes of the parsing helpers.
 * NOTE(review): meanings below are inferred from the enumerator names. */
typedef enum {
    PARSE_OK = 0,        /* parsing succeeded */
    PARSE_ERR = -1,      /* parsing failed */
    PARSE_NEEDMORE = -2, /* more input is needed */
} parseResult;
|
|
|
|
/* Minimum capacity of a client's command queue.
 * NOTE(review): inferred from the name only; the use site is outside this
 * view - confirm. */
#define COMMAND_QUEUE_MIN_CAPACITY 16
|
|
|
|
/* Return the amount of memory used by the sds string at object->ptr
|
|
* for a string object. This includes internal fragmentation. */
|
|
size_t getStringObjectSdsUsedMemory(robj *o) {
|
|
serverAssertWithInfo(NULL, o, o->type == OBJ_STRING);
|
|
if (o->encoding != OBJ_ENCODING_INT) {
|
|
return sdsAllocSize(o->ptr);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* Return the length of a string object.
|
|
* This does NOT include internal fragmentation or sds unused space. */
|
|
size_t getStringObjectLen(robj *o) {
|
|
serverAssertWithInfo(NULL, o, o->type == OBJ_STRING);
|
|
switch (o->encoding) {
|
|
case OBJ_ENCODING_RAW: return sdslen(o->ptr);
|
|
case OBJ_ENCODING_EMBSTR: return sdslen(o->ptr);
|
|
default: return 0; /* Just integer encoding for now. */
|
|
}
|
|
}
|
|
|
|
/* Actual allocated size of a client reply block */
|
|
static size_t clientReplyAllocSize(clientReplyBlock *block) {
|
|
return sizeof(clientReplyBlock) + block->size;
|
|
}
|
|
|
|
/* Client.reply list dup and free methods. */
|
|
void *dupClientReplyValue(void *o) {
|
|
size_t bufsize = clientReplyAllocSize((clientReplyBlock *)o);
|
|
clientReplyBlock *buf = zmalloc(bufsize);
|
|
memcpy(buf, o, bufsize);
|
|
return buf;
|
|
}
|
|
|
|
void freeClientReplyValue(void *o) {
|
|
if (!o) return;
|
|
zfree_with_size(o, clientReplyAllocSize((clientReplyBlock *)o));
|
|
}
|
|
|
|
/* This function links the client to the global linked list of clients.
|
|
* unlinkClient() does the opposite, among other things. */
|
|
void linkClient(client *c) {
|
|
listAddNodeTail(server.clients, c);
|
|
/* Note that we remember the linked list node where the client is stored,
|
|
* this way removing the client in unlinkClient() will not require
|
|
* a linear scan, but just a constant time operation. */
|
|
c->client_list_node = listLast(server.clients);
|
|
uint64_t id = htonu64(c->id);
|
|
raxInsert(server.clients_index, (unsigned char *)&id, sizeof(id), c, NULL);
|
|
}
|
|
|
|
/* Initialize client authentication state. */
|
|
static void clientSetDefaultAuth(client *c) {
|
|
/* If the default user does not require authentication, the user is
|
|
* directly authenticated. */
|
|
clientSetUser(c, DefaultUser, (DefaultUser->flags & USER_FLAG_NOPASS) && !(DefaultUser->flags & USER_FLAG_DISABLED));
|
|
}
|
|
|
|
/* Attach the user u to this client.
|
|
* Also, mark the client authentication state. In case the client is marked as authenticated,
|
|
* it will also set the ever_authenticated flag on the client in order to avoid low level
|
|
* limiting of the client output buffer.*/
|
|
void clientSetUser(client *c, user *u, int authenticated) {
|
|
c->user = u;
|
|
c->flag.authenticated = authenticated;
|
|
if (authenticated)
|
|
c->flag.ever_authenticated = authenticated;
|
|
}
|
|
|
|
/* Return the client's ever_authenticated flag, which clientSetUser() sets
 * whenever it marks the client as authenticated. */
static int clientEverAuthenticated(client *c) {
    int ever = c->flag.ever_authenticated;
    return ever;
}
|
|
|
|
/* Return 1 when the client still has to authenticate, 0 otherwise.
 * No authentication is needed when the default user is flagged "nopass" and
 * is active (not disabled), or when the client is already authenticated. */
int authRequired(client *c) {
    int default_user_open = (DefaultUser->flags & USER_FLAG_NOPASS) && !(DefaultUser->flags & USER_FLAG_DISABLED);
    return !default_user_open && !c->flag.authenticated;
}
|
|
|
|
/* Return non-zero when the replica is in the ONLINE or BG_RDB_LOAD
 * replication state and is not flagged to be closed ASAP. */
static inline int isReplicaReadyForReplData(client *replica) {
    int state = replica->repl_data->repl_state;
    int state_ok = (state == REPLICA_STATE_ONLINE || state == REPLICA_STATE_BG_RDB_LOAD);
    return state_ok && !(replica->flag.close_asap);
}
|
|
|
|
/* Decides if copy avoidance is preferred according to client type, number of I/O threads, object size
|
|
* Maybe called with NULL obj for evaluation with no regard to object size
|
|
* Copy avoidance can be allowed only for regular Valkey clients
|
|
* that use _writeToClient handler to write replies to client connection */
|
|
static int isCopyAvoidPreferred(client *c, robj *obj) {
|
|
if (c->flag.fake || isDeferredReplyEnabled(c)) return 0;
|
|
|
|
int type = getClientType(c);
|
|
if (type != CLIENT_TYPE_NORMAL && type != CLIENT_TYPE_PUBSUB) return 0;
|
|
|
|
if (obj) {
|
|
if (obj->encoding != OBJ_ENCODING_RAW) return 0;
|
|
if (obj->refcount == OBJ_STATIC_REFCOUNT) return 0;
|
|
}
|
|
|
|
/* Copy avoidance is preferred for any string size starting certain number of I/O threads */
|
|
if (server.min_io_threads_copy_avoid && server.io_threads_num >= server.min_io_threads_copy_avoid) return 1;
|
|
|
|
if (!obj) return 0;
|
|
|
|
/* Main thread only. No I/O threads */
|
|
if (server.io_threads_num == 1) {
|
|
/* Copy avoidance is preferred starting certain string size */
|
|
return server.min_string_size_copy_avoid && sdslen(obj->ptr) >= (size_t)server.min_string_size_copy_avoid;
|
|
}
|
|
/* Main thread + I/O threads */
|
|
return server.min_string_size_copy_avoid_threaded && sdslen(obj->ptr) >= (size_t)server.min_string_size_copy_avoid_threaded;
|
|
}
|
|
|
|
/* Allocate and initialize a client.
 *
 * 'conn' may be NULL to create a non connected client. This is useful
 * because all commands need to be executed in the context of a client; when
 * they run in other contexts (for instance a Lua script) a non connected
 * client is needed. Only connected clients are linked into the global
 * clients list. Returns the new client. */
client *createClient(connection *conn) {
    client *c = zmalloc(sizeof(client));

    if (conn != NULL) {
        connSetReadHandler(conn, readQueryFromClient);
        connSetPrivateData(conn, c);
        conn->flags |= CONN_FLAG_ALLOW_ACCEPT_OFFLOAD;
    }

    /* Static reply buffer, selected DB and client id. */
    c->buf = zmalloc_usable(PROTO_REPLY_CHUNK_BYTES, &c->buf_usable_size);
    selectDb(c, 0);
    c->id = atomic_fetch_add_explicit(&server.next_client_id, 1, memory_order_relaxed);
#ifdef LOG_REQ_RES
    reqresReset(c, 0);
    c->resp = server.client_default_resp;
#else
    c->resp = 2;
#endif

    /* Connection and identity. */
    c->conn = conn;
    c->name = NULL;
    c->lib_name = NULL;
    c->lib_ver = NULL;

    /* Static reply buffer bookkeeping. */
    c->bufpos = 0;
    c->last_header = NULL;
    c->buf_peak = c->buf_usable_size;
    c->buf_peak_last_reset_time = server.unixtime;

    /* Query buffer and parsing state. */
    c->qb_pos = 0;
    c->querybuf = NULL;
    c->querybuf_peak = 0;
    c->reqtype = 0;
    c->argc = 0;
    c->argv = NULL;
    c->argv_len = 0;
    c->argv_len_sum = 0;
    c->original_argc = 0;
    c->original_argv = NULL;
    c->nread = 0;
    c->read_flags = 0;
    c->write_flags = 0;

    /* Command queue and current command pointers. */
    c->cmd_queue.cmds = NULL;
    c->cmd_queue.cap = 0;
    c->cmd_queue.off = 0;
    c->cmd_queue.len = 0;
    c->parsed_cmd = NULL;
    c->realcmd = NULL;
    c->lastcmd = NULL;
    c->cmd = NULL;
    c->cur_script = NULL;
    c->multibulklen = 0;
    c->bulklen = -1;

    /* All flags are cleared before the authentication state is set below. */
    c->raw_flag = 0;
    c->capa = 0;
    c->slot = -1;
    c->ctime = c->last_interaction = server.unixtime;
    c->duration = 0;
    clientSetDefaultAuth(c);
    c->slot_migration_job = NULL;

    /* Reply list and deferred reply state. */
    c->reply = listCreate();
    c->deferred_reply = NULL;
    c->deferred_reply_errors = NULL;
    c->reply_bytes = 0;
    c->deferred_reply_bytes = ULLONG_MAX;
    c->obuf_soft_limit_reached_time = 0;
    listSetFreeMethod(c->reply, freeClientReplyValue);
    listSetDupMethod(c->reply, dupClientReplyValue);

    /* Optional per-feature state. */
    c->repl_data = NULL;
    c->bstate = NULL;
    c->pubsub_data = NULL;
    c->module_data = NULL;
    c->mstate = NULL;

    c->woff = 0;
    c->peerid = NULL;
    c->sockname = NULL;
    c->client_list_node = NULL;

    /* I/O state and memory usage tracking. */
    c->io_read_state = CLIENT_IDLE;
    c->io_write_state = CLIENT_IDLE;
    c->nwritten = 0;
    c->last_memory_usage = 0;
    c->last_memory_type = CLIENT_TYPE_NORMAL;
    listInitNode(&c->clients_pending_write_node, c);
    listInitNode(&c->pending_read_list_node, c);
    c->mem_usage_bucket = NULL;
    c->mem_usage_bucket_node = NULL;

    /* Only connected clients are registered in the global clients list. */
    if (conn != NULL) linkClient(c);

    /* Network and command statistics. */
    c->net_input_bytes = 0;
    c->net_input_bytes_curr_cmd = 0;
    c->net_output_bytes = 0;
    c->net_output_bytes_curr_cmd = 0;
    c->commands_processed = 0;

    /* Last written position bookkeeping. */
    c->io_last_reply_block = NULL;
    c->io_last_bufpos = 0;
    c->io_last_written.buf = NULL;
    c->io_last_written.bufpos = 0;
    c->io_last_written.data_len = 0;
    return c;
}
|
|
|
|
/* Install sendReplyToClient as the write handler of the client connection.
 * If the handler cannot be installed, the client is freed asynchronously.
 *
 * With the fsync=always AOF policy a given FD must never be served for
 * reading and writing in the same event loop iteration, so that between
 * receiving the query and serving the reply beforeSleep() gets called and
 * performs the actual fsync of the AOF to disk. The write barrier ensures
 * that. */
void installClientWriteHandler(client *c) {
    int ae_barrier = (server.aof_state == AOF_ON && server.aof_fsync == AOF_FSYNC_ALWAYS) ? 1 : 0;
    if (connSetWriteHandlerWithBarrier(c->conn, sendReplyToClient, ae_barrier) != C_ERR) return;
    freeClientAsync(c);
}
|
|
|
|
/* This function puts the client in the queue of clients that should write
|
|
* their output buffers to the socket. Note that it does not *yet* install
|
|
* the write handler, to start clients are put in a queue of clients that need
|
|
* to write, so we try to do that before returning in the event loop (see the
|
|
* handleClientsWithPendingWrites() function).
|
|
* If we fail and there is more data to write, compared to what the socket
|
|
* buffers can hold, then we'll really install the handler. */
|
|
void putClientInPendingWriteQueue(client *c) {
|
|
/* Schedule the client to write the output buffers to the socket only
|
|
* if not already done and, for replicas, if the replica can actually receive
|
|
* writes at this stage. */
|
|
if (!c->flag.pending_write &&
|
|
(!c->repl_data ||
|
|
c->repl_data->repl_state == REPL_STATE_NONE ||
|
|
(isReplicaReadyForReplData(c) && !c->repl_data->repl_start_cmd_stream_on_ack)) &&
|
|
clusterSlotMigrationShouldInstallWriteHandler(c)) {
|
|
/* Here instead of installing the write handler, we just flag the
|
|
* client and put it into a list of clients that have something
|
|
* to write to the socket. This way before re-entering the event
|
|
* loop, we can try to directly write to the client sockets avoiding
|
|
* a system call. We'll only really install the write handler if
|
|
* we'll not be able to write the whole reply at once. */
|
|
c->flag.pending_write = 1;
|
|
listLinkNodeHead(server.clients_pending_write, &c->clients_pending_write_node);
|
|
}
|
|
}
|
|
/* This function is called every time we are going to transmit new data
|
|
* to the client. The behavior is the following:
|
|
*
|
|
* If the client should receive new data (normal clients will) the function
|
|
* returns C_OK, and make sure to install the write handler in our event
|
|
* loop so that when the socket is writable new data gets written.
|
|
*
|
|
* If the client should not receive new data, because it is a fake client
|
|
* (used to load AOF in memory), a primary or because the setup of the write
|
|
* handler failed, the function returns C_ERR.
|
|
*
|
|
* The function may return C_OK without actually installing the write
|
|
* event handler in the following cases:
|
|
*
|
|
* 1) The event handler should already be installed since the output buffer
|
|
* already contains something.
|
|
* 2) The client is a replica but not yet online, so we want to just accumulate
|
|
* writes in the buffer but not actually sending them yet.
|
|
*
|
|
* Typically gets called every time a reply is built, before adding more
|
|
* data to the clients output buffers. If the function returns C_ERR no
|
|
* data should be appended to the output buffers. */
|
|
int prepareClientToWrite(client *c) {
|
|
/* If it's the Lua client we always return ok without installing any
|
|
* handler since there is no socket at all. */
|
|
if (c->flag.script || c->flag.module) return C_OK;
|
|
|
|
/* If CLIENT_CLOSE_ASAP flag is set, we need not write anything. */
|
|
if (c->flag.close_asap) return C_ERR;
|
|
|
|
/* CLIENT REPLY OFF / SKIP handling: don't send replies.
|
|
* CLIENT_PUSHING handling: disables the reply silencing flags. */
|
|
if ((c->flag.reply_off || c->flag.reply_skip) && !c->flag.pushing) return C_ERR;
|
|
|
|
/* Primaries don't receive replies, unless CLIENT_PRIMARY_FORCE_REPLY flag
|
|
* is set. */
|
|
if (c->flag.primary && !c->flag.primary_force_reply) return C_ERR;
|
|
|
|
/* Skip the fake client, such as the fake client for AOF loading.
|
|
* But CLIENT_ID_CACHED_RESPONSE is allowed since it is a fake client
|
|
* but has a connection to cache the response. */
|
|
if (c->flag.fake && c->id != CLIENT_ID_CACHED_RESPONSE) return C_ERR;
|
|
serverAssert(c->conn);
|
|
|
|
/* Schedule the client to write the output buffers to the socket, unless
|
|
* it should already be setup to do so (it has already pending data). */
|
|
if (!clientHasPendingReplies(c)) putClientInPendingWriteQueue(c);
|
|
|
|
if (!isDeferredReplyEnabled(c)) c->flag.buffered_reply = 1;
|
|
/* Authorize the caller to queue in the output buffer of this client. */
|
|
return C_OK;
|
|
}
|
|
|
|
/* Returns everything in the client reply linked list in a SDS format.
|
|
* This should only be used only with a caching client. */
|
|
sds aggregateClientOutputBuffer(client *c) {
|
|
sds cmd_response = sdsempty();
|
|
listIter li;
|
|
listNode *ln;
|
|
clientReplyBlock *val_block;
|
|
listRewind(c->reply, &li);
|
|
|
|
/* Here, c->buf is not used, thus we confirm c->bufpos remains 0. */
|
|
serverAssert(c->bufpos == 0);
|
|
while ((ln = listNext(&li)) != NULL) {
|
|
val_block = (clientReplyBlock *)listNodeValue(ln);
|
|
cmd_response = sdscatlen(cmd_response, val_block->buf, val_block->used);
|
|
}
|
|
return cmd_response;
|
|
}
|
|
|
|
/* This function creates and returns a fake client for recording the command response
|
|
* to initiate caching of any command response.
|
|
*
|
|
* It needs be paired with `deleteCachedResponseClient` function to stop caching. */
|
|
client *createCachedResponseClient(int resp) {
|
|
struct client *recording_client = createClient(NULL);
|
|
/* It is a fake client but with a connection, setting a special client id,
|
|
* so we can identify it's a fake cached response client. */
|
|
recording_client->id = CLIENT_ID_CACHED_RESPONSE;
|
|
recording_client->resp = resp;
|
|
/* Allocating the `conn` allows to prepare the caching client before adding
|
|
* data to the clients output buffer by `prepareClientToWrite`. */
|
|
recording_client->conn = zcalloc(sizeof(connection));
|
|
recording_client->flag.fake = 1;
|
|
return recording_client;
|
|
}
|
|
|
|
/* This function is used to stop caching of any command response after `createCachedResponseClient` is called.
|
|
* It returns the command response as SDS from the recording_client's reply buffer. */
|
|
void deleteCachedResponseClient(client *recording_client) {
|
|
zfree(recording_client->conn);
|
|
recording_client->conn = NULL;
|
|
freeClient(recording_client);
|
|
}
|
|
|
|
/* -----------------------------------------------------------------------------
 * Low level functions to add more data to output buffers.
 * -------------------------------------------------------------------------- */
|
|
|
|
/* Updates an existing header, if possible; otherwise inserts a new one
|
|
* Returns the length of data that can be added to the reply buffer (i.e. min(available, requested)) */
|
|
static size_t upsertPayloadHeader(char *buf, size_t *bufpos, payloadHeader **last_header, uint8_t type, size_t len, int slot, size_t available) {
|
|
/* Enforce min len for BULK_STR_REF chunks as whole pointers must be written to the buffer */
|
|
size_t min_len = (type == BULK_STR_REF ? len : 1);
|
|
if (min_len > available) return 0;
|
|
size_t allowed_len = min(available, len);
|
|
|
|
// If cluster slots stats disabled set slot to -1 to prevent excessive per slot headers
|
|
if (!clusterSlotStatsEnabled(slot)) slot = -1;
|
|
|
|
/* Try to add payload to last chunk if possible */
|
|
if (*last_header != NULL && (*last_header)->payload_type == type && (*last_header)->slot == slot) {
|
|
(*last_header)->payload_len += allowed_len;
|
|
return allowed_len;
|
|
}
|
|
|
|
/* Recheck min len condition and recalculate allowed len with a new header to be added */
|
|
if (sizeof(payloadHeader) + min_len > available) return 0;
|
|
available -= sizeof(payloadHeader);
|
|
if (len > available) allowed_len = available;
|
|
|
|
/* Start a new payload chunk */
|
|
*last_header = (payloadHeader *)(buf + *bufpos);
|
|
|
|
(*last_header)->payload_type = type;
|
|
(*last_header)->payload_len = allowed_len;
|
|
(*last_header)->slot = slot;
|
|
(*last_header)->reply_len = 0;
|
|
|
|
*bufpos += sizeof(payloadHeader);
|
|
|
|
return allowed_len;
|
|
}
|
|
|
|
/* Attempts to add the reply to the static buffer in the client struct.
|
|
* Returns the length of data that is added to the reply buffer.
|
|
*
|
|
* Sanitizer suppression: client->buf_usable_size determined by
|
|
* zmalloc_usable_size() call. Writing beyond client->buf boundaries confuses
|
|
* sanitizer and generates a false positive out-of-bounds error */
|
|
VALKEY_NO_SANITIZE("bounds")
|
|
static size_t _addReplyPayloadToBuffer(client *c, const void *payload, size_t len, uint8_t payload_type) {
|
|
/* If the debug enforcing to use the reply list is enabled.*/
|
|
if (server.debug_client_enforce_reply_list) return 0;
|
|
/* If there already are entries in the reply list, we cannot
|
|
* add anything more to the static buffer. */
|
|
if (listLength(c->reply) > 0) return 0;
|
|
|
|
size_t available = c->buf_usable_size - c->bufpos;
|
|
size_t reply_len = min(available, len);
|
|
if (c->flag.buf_encoded) {
|
|
reply_len = upsertPayloadHeader(c->buf, &c->bufpos, &c->last_header, payload_type, len, c->slot, available);
|
|
}
|
|
if (!reply_len) return 0;
|
|
|
|
memcpy(c->buf + c->bufpos, payload, reply_len);
|
|
c->bufpos += reply_len;
|
|
/* We update the buffer peak after appending the reply to the buffer */
|
|
if (c->buf_peak < (size_t)c->bufpos) c->buf_peak = (size_t)c->bufpos;
|
|
return reply_len;
|
|
}
|
|
|
|
/* Append a plain reply to the static buffer. Returns the number of bytes
 * added (0 for an empty reply). While the buffer is empty, its encoded mode
 * is (re)decided with isCopyAvoidPreferred(). */
static size_t _addReplyToBuffer(client *c, const char *s, size_t len) {
    if (len == 0) return 0;
    if (c->bufpos == 0) c->flag.buf_encoded = isCopyAvoidPreferred(c, NULL);
    return _addReplyPayloadToBuffer(c, s, len, PLAIN_REPLY);
}
|
|
|
|
/* Adds bulk string reference (i.e. pointer to object and pointer to string itself) to static buffer
|
|
* Returns non-zero value if succeeded to add */
|
|
static size_t _addBulkStrRefToBuffer(client *c, const void *payload, size_t len) {
|
|
if (!c->flag.buf_encoded) {
|
|
/* If buffer is plain and not empty then can't add bulk string reference to it */
|
|
if (c->bufpos) return 0;
|
|
c->flag.buf_encoded = 1;
|
|
}
|
|
return _addReplyPayloadToBuffer(c, payload, len, BULK_STR_REF);
|
|
}
|
|
|
|
/* Adds the payload to the reply linked list.
|
|
* Note: some edits to this function need to be relayed to AddReplyFromClient. */
|
|
static void _addReplyPayloadToList(client *c, list *reply_list, const char *payload, size_t len, uint8_t payload_type) {
|
|
listNode *ln = listLast(reply_list);
|
|
clientReplyBlock *tail = ln ? listNodeValue(ln) : NULL;
|
|
/* Determine if encoded buffer is required */
|
|
int encoded = payload_type == BULK_STR_REF || isCopyAvoidPreferred(c, NULL);
|
|
|
|
/* Note that 'tail' may be NULL even if we have a tail node, because when
|
|
* addReplyDeferredLen() is used, it sets a dummy node to NULL just
|
|
* to fill it later, when the size of the bulk length is set. */
|
|
|
|
/* Append to tail string when possible. */
|
|
if (tail) {
|
|
/* Copy the part we can fit into the tail, and leave the rest for a
|
|
* new node */
|
|
size_t avail = tail->size - tail->used;
|
|
size_t copy = avail >= len ? len : avail;
|
|
|
|
if (tail->flag.buf_encoded) {
|
|
copy = upsertPayloadHeader(tail->buf, &tail->used, &tail->last_header, payload_type, len, c->slot, avail);
|
|
} else if (encoded) {
|
|
/* If encoded buffer is required but tail is unencoded then pretend nothing can be added to it
|
|
* and, as consequence, cause addition of a new tail */
|
|
copy = 0;
|
|
}
|
|
|
|
if (copy) {
|
|
memcpy(tail->buf + tail->used, payload, copy);
|
|
tail->used += copy;
|
|
payload += copy;
|
|
len -= copy;
|
|
}
|
|
}
|
|
if (len) {
|
|
/* Create a new node, make sure it is allocated to at
|
|
* least PROTO_REPLY_CHUNK_BYTES */
|
|
size_t usable_size;
|
|
size_t min_reply_size = isDeferredReplyEnabled(c) ? PROTO_REPLY_MIN_BYTES : PROTO_REPLY_CHUNK_BYTES;
|
|
size_t required_size = encoded ? len + sizeof(payloadHeader) : len;
|
|
size_t size = required_size < min_reply_size ? min_reply_size : required_size;
|
|
tail = zmalloc_usable(size + sizeof(clientReplyBlock), &usable_size);
|
|
/* take over the allocation's internal fragmentation */
|
|
tail->size = usable_size - sizeof(clientReplyBlock);
|
|
tail->used = 0;
|
|
tail->flag.buf_encoded = encoded;
|
|
tail->last_header = NULL;
|
|
if (tail->flag.buf_encoded) {
|
|
upsertPayloadHeader(tail->buf, &tail->used, &tail->last_header, payload_type, len, c->slot, tail->size);
|
|
}
|
|
memcpy(tail->buf + tail->used, payload, len);
|
|
tail->used += len;
|
|
listAddNodeTail(reply_list, tail);
|
|
unsigned long long *reply_bytes = (isDeferredReplyEnabled(c)) ? &c->deferred_reply_bytes : &c->reply_bytes;
|
|
*reply_bytes += tail->size;
|
|
|
|
closeClientOnOutputBufferLimitReached(c, 1);
|
|
}
|
|
}
|
|
|
|
/* Append 'len' bytes of plain protocol to 'reply_list'. An empty reply is
 * ignored. */
void _addReplyProtoToList(client *c, list *reply_list, const char *s, size_t len) {
    if (len != 0) _addReplyPayloadToList(c, reply_list, s, len, PLAIN_REPLY);
}
|
|
|
|
/* Adds bulk string reference (i.e. pointer to object and pointer to string itself) to reply list */
|
|
static void _addBulkStrRefToToList(client *c, const void *payload, size_t len) {
|
|
_addReplyPayloadToList(c, c->reply, payload, len, BULK_STR_REF);
|
|
}
|
|
|
|
/* The subscribe / unsubscribe command family has a push as a reply,
|
|
* or in other words, it responds with a push (or several of them
|
|
* depending on how many arguments it got), and has no reply. */
|
|
int cmdHasPushAsReply(struct serverCommand *cmd) {
|
|
if (!cmd) return 0;
|
|
return cmd->proc == subscribeCommand || cmd->proc == unsubscribeCommand || cmd->proc == psubscribeCommand ||
|
|
cmd->proc == punsubscribeCommand || cmd->proc == ssubscribeCommand || cmd->proc == sunsubscribeCommand;
|
|
}
|
|
|
|
/* Add 'len' bytes of reply protocol to the client output: the static buffer
 * first, then the reply list for whatever does not fit. Push messages aimed
 * at the current client and deferred replies are routed to their own lists
 * instead. Nothing is added to a client flagged close_after_reply. */
void _addReplyToBufferOrList(client *c, const char *s, size_t len) {
    if (c->flag.close_after_reply) return;

    /* Replicas should normally not cause any writes to the reply buffer. If
     * a rogue replica sent a command on the replication link that caused a
     * reply to be generated, it is simply disconnected. This is the simplest
     * way to check that a command added a response: replication links are
     * used to write data but not for responses, so this point should normally
     * never be reached for a replica client. */
    if (getClientType(c) == CLIENT_TYPE_REPLICA) {
        sds cmdname = NULL;
        if (c->lastcmd) cmdname = c->lastcmd->fullname;
        logInvalidUseAndFreeClientAsync(c, "Replica generated a reply to command '%s'",
                                        cmdname ? cmdname : "<unknown>");
        return;
    }

    c->net_output_bytes_curr_cmd += len;

    /* Called here because this function may affect the reply buffer offset
     * (see function comment). */
    reqresSaveClientReplyOffset(c);

    /* When a push message is being processed into the current client (i.e.
     * executing PUBLISH to a channel this client is subscribed to), the
     * message is postponed so that it is added after the command's reply
     * (specifically important during multi-exec). The exception is the
     * SUBSCRIBE command family, which (currently) has a push message instead
     * of a proper reply. The check for executing_client also avoids affecting
     * push messages that are part of eviction. CLIENT_PUSHING is checked
     * first to avoid race conditions, as it is absent in module's fake
     * client. */
    int defer_push_message = c->flag.pushing && c == server.current_client && server.executing_client &&
                             !cmdHasPushAsReply(server.executing_client->cmd);

    if (defer_push_message) {
        _addReplyProtoToList(c, server.pending_push_messages, s, len);
        return;
    }

    if (isDeferredReplyEnabled(c)) {
        _addReplyProtoToList(c, c->deferred_reply, s, len);
        return;
    }

    /* Regular path: static buffer first, the remainder goes to the list. */
    size_t buffered = _addReplyToBuffer(c, s, len);
    if (buffered < len) _addReplyProtoToList(c, c->reply, s + buffered, len - buffered);
}
|
|
|
|
/* Increment reference to object and add pointer to object and
|
|
* pointer to string itself to current reply buffer */
|
|
static void _addBulkStrRefToBufferOrList(client *c, robj *obj) {
|
|
if (c->flag.close_after_reply) return;
|
|
|
|
/* Refcount will be decremented in write completion handler by the main thread */
|
|
incrRefCount(obj);
|
|
|
|
bulkStrRef str_ref = {.obj = obj, .str = obj->ptr};
|
|
if (!_addBulkStrRefToBuffer(c, (void *)&str_ref, sizeof(str_ref))) {
|
|
_addBulkStrRefToToList(c, (void *)&str_ref, sizeof(str_ref));
|
|
}
|
|
}
|
|
|
|
/* -----------------------------------------------------------------------------
 * Higher level functions to queue data on the client output buffer.
 * The following functions are the ones that commands implementations will call.
 * -------------------------------------------------------------------------- */
|
|
|
|
/* Add the object 'obj' string representation to the client output buffer. */
|
|
void addReply(client *c, robj *obj) {
|
|
if (prepareClientToWrite(c) != C_OK) return;
|
|
|
|
if (sdsEncodedObject(obj)) {
|
|
_addReplyToBufferOrList(c, obj->ptr, sdslen(obj->ptr));
|
|
} else if (obj->encoding == OBJ_ENCODING_INT) {
|
|
/* For integer encoded strings we just convert it into a string
|
|
* using our optimized function, and attach the resulting string
|
|
* to the output buffer. */
|
|
char buf[32];
|
|
size_t len = ll2string(buf, sizeof(buf), (long)obj->ptr);
|
|
_addReplyToBufferOrList(c, buf, len);
|
|
} else {
|
|
serverPanic("Wrong obj->encoding in addReply()");
|
|
}
|
|
}
|
|
|
|
/* Add the SDS 's' string to the client output buffer, as a side effect
|
|
* the SDS string is freed. */
|
|
void addReplySds(client *c, sds s) {
|
|
if (prepareClientToWrite(c) != C_OK) {
|
|
/* The caller expects the sds to be free'd. */
|
|
sdsfree(s);
|
|
return;
|
|
}
|
|
_addReplyToBufferOrList(c, s, sdslen(s));
|
|
sdsfree(s);
|
|
}
|
|
|
|
/* This low level function just adds whatever protocol you send it to the
|
|
* client buffer, trying the static buffer initially, and using the string
|
|
* of objects if not possible.
|
|
*
|
|
* It is efficient because does not create an SDS object nor an Object
|
|
* if not needed. The object will only be created by calling
|
|
* _addReplyProtoToList() if we fail to extend the existing tail object
|
|
* in the list of objects. */
|
|
void addReplyProto(client *c, const char *s, size_t len) {
|
|
if (prepareClientToWrite(c) != C_OK) return;
|
|
_addReplyToBufferOrList(c, s, len);
|
|
}
|
|
|
|
/* Low level function called by the addReplyError...() functions.
|
|
* It emits the protocol for an error reply, in the form:
|
|
*
|
|
* -ERRORCODE Error Message<CR><LF>
|
|
*
|
|
* If the error code is already passed in the string 's', the error
|
|
* code provided is used, otherwise the string "-ERR " for the generic
|
|
* error code is automatically added.
|
|
* Note that 's' must NOT end with \r\n. */
|
|
void addReplyErrorLength(client *c, const char *s, size_t len) {
|
|
/* If the string already starts with "-..." then the error code
|
|
* is provided by the caller. Otherwise we use "-ERR". */
|
|
if (!len || s[0] != '-') addReplyProto(c, "-ERR ", 5);
|
|
addReplyProto(c, s, len);
|
|
addReplyProto(c, "\r\n", 2);
|
|
}
|
|
|
|
/* Do some actions after an error reply was sent (Log if needed, updates stats, etc.)
|
|
* Possible flags:
|
|
* * ERR_REPLY_FLAG_NO_STATS_UPDATE - indicate not to update any error stats. */
|
|
void afterErrorReply(client *c, const char *s, size_t len, int flags) {
|
|
/* Module clients fall into two categories:
|
|
* Calls to RM_Call, in which case the error isn't being returned to a client, so should not be counted.
|
|
* Module thread safe context calls to RM_ReplyWithError, which will be added to a real client by the main thread
|
|
* later. */
|
|
if (c->flag.module) {
|
|
if (!c->deferred_reply_errors) {
|
|
c->deferred_reply_errors = listCreate();
|
|
listSetFreeMethod(c->deferred_reply_errors, sdsfreeVoid);
|
|
}
|
|
listAddNodeTail(c->deferred_reply_errors, sdsnewlen(s, len));
|
|
return;
|
|
}
|
|
|
|
commitDeferredReplyBuffer(c, 1);
|
|
if (!(flags & ERR_REPLY_FLAG_NO_STATS_UPDATE)) {
|
|
/* Increment the global error counter */
|
|
server.stat_total_error_replies++;
|
|
/* Increment the error stats
|
|
* If the string already starts with "-..." then the error prefix
|
|
* is provided by the caller (we limit the search to 32 chars). Otherwise we use "-ERR". */
|
|
char *err_prefix = "ERR";
|
|
size_t prefix_len = 3;
|
|
if (s[0] == '-') {
|
|
const char *spaceloc = memchr(s, ' ', len < 32 ? len : 32);
|
|
/* If we cannot retrieve the error prefix, use the default: "ERR". */
|
|
if (spaceloc) {
|
|
const size_t errEndPos = (size_t)(spaceloc - s);
|
|
err_prefix = (char *)s + 1;
|
|
prefix_len = errEndPos - 1;
|
|
}
|
|
}
|
|
/* After the errors RAX reaches its limit, instead of tracking
|
|
* custom errors (e.g. LUA), we track the error under `errorstat_ERRORSTATS_OVERFLOW` */
|
|
if (flags & ERR_REPLY_FLAG_CUSTOM && raxSize(server.errors) >= ERRORSTATS_LIMIT &&
|
|
!raxFind(server.errors, (unsigned char *)err_prefix, prefix_len, NULL)) {
|
|
err_prefix = ERRORSTATS_OVERFLOW_ERR;
|
|
prefix_len = strlen(ERRORSTATS_OVERFLOW_ERR);
|
|
}
|
|
incrementErrorCount(err_prefix, prefix_len);
|
|
} else {
|
|
/* stat_total_error_replies will not be updated, which means that
|
|
* the cmd stats will not be updated as well, we still want this command
|
|
* to be counted as failed so we update it here. We update c->realcmd in
|
|
* case c->cmd was changed (like in GEOADD). */
|
|
c->realcmd->failed_calls++;
|
|
}
|
|
|
|
/* Sometimes it could be normal that a replica replies to a primary with
|
|
* an error and this function gets called. Actually the error will never
|
|
* be sent because addReply*() against primary clients has no effect...
|
|
*
|
|
* It can happen when the versions are different and replica cannot recognize
|
|
* the commands sent by the primary. However it is useful to log such events since
|
|
* they are rare and may hint at errors in a script or a bug in the server. */
|
|
int ctype = getClientType(c);
|
|
if (ctype == CLIENT_TYPE_PRIMARY || ctype == CLIENT_TYPE_REPLICA || c->id == CLIENT_ID_AOF || ctype == CLIENT_TYPE_SLOT_IMPORT || ctype == CLIENT_TYPE_SLOT_EXPORT) {
|
|
char *to, *from;
|
|
|
|
if (c->id == CLIENT_ID_AOF) {
|
|
to = "AOF-loading-client";
|
|
from = "server";
|
|
} else if (ctype == CLIENT_TYPE_PRIMARY) {
|
|
to = "primary";
|
|
from = "replica";
|
|
} else if (ctype == CLIENT_TYPE_REPLICA) {
|
|
to = "replica";
|
|
from = "primary";
|
|
} else if (ctype == CLIENT_TYPE_SLOT_IMPORT) {
|
|
to = "slot-import-source";
|
|
from = "slot-import-target";
|
|
} else if (ctype == CLIENT_TYPE_SLOT_EXPORT) {
|
|
to = "slot-export-target";
|
|
from = "slot-export-source";
|
|
} else {
|
|
serverAssert(0);
|
|
}
|
|
|
|
if (len > 4096) len = 4096;
|
|
sds cmdname = c->lastcmd ? c->lastcmd->fullname : NULL;
|
|
serverLog(LL_WARNING,
|
|
"== CRITICAL == This %s is sending an error "
|
|
"to its %s: '%.*s' after processing the command "
|
|
"'%s'",
|
|
from, to, (int)len, s, cmdname ? cmdname : "<unknown>");
|
|
if (ctype == CLIENT_TYPE_PRIMARY && server.repl_backlog && server.repl_backlog->histlen > 0) {
|
|
showLatestBacklog();
|
|
}
|
|
server.stat_unexpected_error_replies++;
|
|
|
|
/* Based off the propagation error behavior, check if we need to panic here. There
|
|
* are currently two checked cases:
|
|
* * If this command was from our primary and we are not a writable replica.
|
|
* * We are reading from an AOF file. */
|
|
int panic_in_replicas = (ctype == CLIENT_TYPE_PRIMARY && server.repl_replica_ro) &&
|
|
(server.propagation_error_behavior == PROPAGATION_ERR_BEHAVIOR_PANIC ||
|
|
server.propagation_error_behavior == PROPAGATION_ERR_BEHAVIOR_PANIC_ON_REPLICAS);
|
|
int panic_in_aof =
|
|
c->id == CLIENT_ID_AOF && server.propagation_error_behavior == PROPAGATION_ERR_BEHAVIOR_PANIC;
|
|
if (panic_in_replicas || panic_in_aof) {
|
|
serverPanic("This %s panicked sending an error to its %s"
|
|
" after processing the command '%s'",
|
|
from, to, cmdname ? cmdname : "<unknown>");
|
|
}
|
|
if (ctype == CLIENT_TYPE_SLOT_IMPORT || ctype == CLIENT_TYPE_SLOT_EXPORT) {
|
|
clusterHandleSlotMigrationErrorResponse(c->slot_migration_job);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* The 'err' object is expected to start with -ERRORCODE and end with \r\n.
|
|
* Unlike addReplyErrorSds and others alike which rely on addReplyErrorLength. */
|
|
void addReplyErrorObject(client *c, robj *err) {
|
|
addReply(c, err);
|
|
afterErrorReply(c, err->ptr, sdslen(err->ptr) - 2, 0); /* Ignore trailing \r\n */
|
|
}
|
|
|
|
/* Sends either a reply or an error reply by checking the first char.
|
|
* If the first char is '-' the reply is considered an error.
|
|
* In any case the given reply is sent, if the reply is also recognize
|
|
* as an error we also perform some post reply operations such as
|
|
* logging and stats update. */
|
|
void addReplyOrErrorObject(client *c, robj *reply) {
|
|
serverAssert(sdsEncodedObject(reply));
|
|
sds rep = reply->ptr;
|
|
if (sdslen(rep) > 1 && rep[0] == '-') {
|
|
addReplyErrorObject(c, reply);
|
|
} else {
|
|
addReply(c, reply);
|
|
}
|
|
}
|
|
|
|
/* See addReplyErrorLength for expectations from the input string. */
|
|
void addReplyError(client *c, const char *err) {
|
|
addReplyErrorLength(c, err, strlen(err));
|
|
afterErrorReply(c, err, strlen(err), 0);
|
|
}
|
|
|
|
/* Add error reply to the given client.
|
|
* Supported flags:
|
|
* * ERR_REPLY_FLAG_NO_STATS_UPDATE - indicate not to perform any error stats updates */
|
|
void addReplyErrorSdsEx(client *c, sds err, int flags) {
|
|
addReplyErrorLength(c, err, sdslen(err));
|
|
afterErrorReply(c, err, sdslen(err), flags);
|
|
sdsfree(err);
|
|
}
|
|
|
|
/* See addReplyErrorLength for expectations from the input string. */
|
|
/* As a side effect the SDS string is freed. */
|
|
void addReplyErrorSds(client *c, sds err) {
|
|
addReplyErrorSdsEx(c, err, 0);
|
|
}
|
|
|
|
/* See addReplyErrorLength for expectations from the input string. */
|
|
/* As a side effect the SDS string is freed. */
|
|
void addReplyErrorSdsSafe(client *c, sds err) {
|
|
err = sdsmapchars(err, "\r\n", " ", 2);
|
|
addReplyErrorSdsEx(c, err, 0);
|
|
}
|
|
|
|
/* Internal function used by addReplyErrorFormat, addReplyErrorFormatEx and RM_ReplyWithErrorFormat.
|
|
* Refer to afterErrorReply for more information about the flags. */
|
|
void addReplyErrorFormatInternal(client *c, int flags, const char *fmt, va_list ap) {
|
|
va_list cpy;
|
|
va_copy(cpy, ap);
|
|
sds s = sdscatvprintf(sdsempty(), fmt, cpy);
|
|
va_end(cpy);
|
|
/* Trim any newlines at the end (ones will be added by addReplyErrorLength) */
|
|
s = sdstrim(s, "\r\n");
|
|
/* Make sure there are no newlines in the middle of the string, otherwise
|
|
* invalid protocol is emitted. */
|
|
s = sdsmapchars(s, "\r\n", " ", 2);
|
|
addReplyErrorLength(c, s, sdslen(s));
|
|
afterErrorReply(c, s, sdslen(s), flags);
|
|
sdsfree(s);
|
|
}
|
|
|
|
/* printf-like error reply that also accepts the 'flags' understood by
 * afterErrorReply. The formatting and the delivery are performed by
 * addReplyErrorFormatInternal. */
void addReplyErrorFormatEx(client *c, int flags, const char *fmt, ...) {
    va_list args;

    va_start(args, fmt);
    addReplyErrorFormatInternal(c, flags, fmt, args);
    va_end(args);
}
|
|
|
|
/* See addReplyErrorLength for expectations from the formatted string.
|
|
* The formatted string is safe to contain \r and \n anywhere. */
|
|
void addReplyErrorFormat(client *c, const char *fmt, ...) {
|
|
va_list ap;
|
|
va_start(ap, fmt);
|
|
addReplyErrorFormatInternal(c, 0, fmt, ap);
|
|
va_end(ap);
|
|
}
|
|
|
|
/* Reply with the "wrong number of arguments" error, naming the command
 * currently set in c->cmd. */
void addReplyErrorArity(client *c) {
    const char *cmd_name = c->cmd->fullname;
    addReplyErrorFormat(c, "wrong number of arguments for '%s' command", cmd_name);
}
|
|
|
|
/* Reply with the "invalid expire time" error, naming the command currently
 * set in c->cmd. */
void addReplyErrorExpireTime(client *c) {
    const char *cmd_name = c->cmd->fullname;
    addReplyErrorFormat(c, "invalid expire time in '%s' command", cmd_name);
}
|
|
|
|
/* Emit a status reply: '+', then the 'len' bytes at 's', then \r\n. */
void addReplyStatusLength(client *c, const char *s, size_t len) {
    static const char status_marker[] = "+";
    static const char crlf[] = "\r\n";

    addReplyProto(c, status_marker, 1);
    addReplyProto(c, s, len);
    addReplyProto(c, crlf, 2);
}
|
|
|
|
/* Emit the NUL terminated string 'status' as a status reply. */
void addReplyStatus(client *c, const char *status) {
    size_t status_len = strlen(status);
    addReplyStatusLength(c, status, status_len);
}
|
|
|
|
/* printf-like status reply: the formatted text is built in a temporary SDS
 * string, sent with addReplyStatusLength and then released. */
void addReplyStatusFormat(client *c, const char *fmt, ...) {
    va_list args;
    sds text;

    va_start(args, fmt);
    text = sdscatvprintf(sdsempty(), fmt, args);
    va_end(args);

    addReplyStatusLength(c, text, sdslen(text));
    sdsfree(text);
}
|
|
|
|
/* Sometimes we are forced to create a new reply node, and we can't append to
|
|
* the previous one, when that happens, we wanna try to trim the unused space
|
|
* at the end of the last reply node which we won't use anymore. */
|
|
void trimReplyUnusedTailSpace(client *c) {
|
|
listNode *ln = listLast(c->reply);
|
|
clientReplyBlock *tail = ln ? listNodeValue(ln) : NULL;
|
|
|
|
/* Note that 'tail' may be NULL even if we have a tail node, because when
|
|
* addReplyDeferredLen() is used */
|
|
if (!tail) return;
|
|
|
|
/* We only try to trim the space is relatively high (more than a 1/4 of the
|
|
* allocation), otherwise there's a high chance realloc will NOP.
|
|
* Also, to avoid large memmove which happens as part of realloc, we only do
|
|
* that if the used part is small. */
|
|
if (tail->size - tail->used > tail->size / 4 && tail->used < PROTO_REPLY_CHUNK_BYTES &&
|
|
c->io_write_state != CLIENT_PENDING_IO && !tail->flag.buf_encoded) {
|
|
size_t usable_size;
|
|
size_t old_size = tail->size;
|
|
tail = zrealloc_usable(tail, tail->used + sizeof(clientReplyBlock), &usable_size);
|
|
/* take over the allocation's internal fragmentation (at least for
|
|
* memory usage tracking) */
|
|
tail->size = usable_size - sizeof(clientReplyBlock);
|
|
c->reply_bytes = c->reply_bytes + tail->size - old_size;
|
|
listNodeValue(ln) = tail;
|
|
}
|
|
}
|
|
|
|
/* Adds an empty object to the reply list that will contain the multi bulk
|
|
* length, which is not known when this function is called. */
|
|
void *addReplyDeferredLen(client *c) {
|
|
/* Note that we install the write event here even if the object is not
|
|
* ready to be sent, since we are sure that before returning to the
|
|
* event loop setDeferredAggregateLen() will be called. */
|
|
if (prepareClientToWrite(c) != C_OK) return NULL;
|
|
|
|
/* Replicas should normally not cause any writes to the reply buffer. In case a rogue replica sent a command on the
|
|
* replication link that caused a reply to be generated we'll simply disconnect it.
|
|
* Note this is the simplest way to check a command added a response. Replication links are used to write data but
|
|
* not for responses, so we should normally never get here on a replica client. */
|
|
if (getClientType(c) == CLIENT_TYPE_REPLICA) {
|
|
sds cmdname = c->lastcmd ? c->lastcmd->fullname : NULL;
|
|
logInvalidUseAndFreeClientAsync(c, "Replica generated a reply to command '%s'",
|
|
cmdname ? cmdname : "<unknown>");
|
|
return NULL;
|
|
}
|
|
|
|
/* We call it here because this function conceptually affects the reply
|
|
* buffer offset (see function comment) */
|
|
reqresSaveClientReplyOffset(c);
|
|
|
|
trimReplyUnusedTailSpace(c);
|
|
listAddNodeTail(c->reply, NULL); /* NULL is our placeholder. */
|
|
return listLast(c->reply);
|
|
}
|
|
|
|
/* Fill the placeholder 'node', created by addReplyDeferredLen(), with the
 * 'length' bytes at 's'. A NULL 'node' (client not accepting writes) makes
 * the call a no-op.
 *
 * Normally the placeholder is replaced by a new buffer holding the data.
 * However when there is room in the adjacent nodes the placeholder can be
 * removed and the data suffixed to the previous node and/or prefixed to the
 * next one, in order to save a write(2) syscall later. Conditions:
 *
 * - The prev node is non-NULL and has space in it or
 * - The next node is non-NULL,
 * - It has enough room already allocated
 * - And not too large (avoid large memmove)
 * - And the client is not in a pending I/O state
 *
 * Nodes flagged buf_encoded are never used for gluing. */
void setDeferredReply(client *c, void *node, const char *s, size_t length) {
    /* NULL is what addReplyDeferredLen() returns when the client should not
     * accept writes. */
    if (node == NULL) return;

    listNode *ln = (listNode *)node;
    serverAssert(!listNodeValue(ln));

    /* Step 1: put as much as possible at the end of the previous node. */
    clientReplyBlock *prev = (ln->prev != NULL) ? listNodeValue(ln->prev) : NULL;
    if (prev && prev->size > prev->used && c->io_write_state != CLIENT_PENDING_IO &&
        !prev->flag.buf_encoded) {
        size_t chunk = prev->size - prev->used;
        if (chunk > length) chunk = length;

        memcpy(prev->buf + prev->used, s, chunk);
        c->net_output_bytes_curr_cmd += chunk;
        prev->used += chunk;
        length -= chunk;
        if (length == 0) {
            /* Everything fit: the placeholder is not needed anymore. */
            listDelNode(c->reply, ln);
            return;
        }
        s += chunk;
    }

    /* Step 2: try to put what is left in front of the next node. */
    clientReplyBlock *next = (ln->next != NULL) ? listNodeValue(ln->next) : NULL;
    if (next && next->size - next->used >= length && next->used < PROTO_REPLY_CHUNK_BYTES * 4 &&
        c->io_write_state != CLIENT_PENDING_IO && !next->flag.buf_encoded) {
        /* Shift the existing content to make room at the beginning. */
        memmove(next->buf + length, next->buf, next->used);
        memcpy(next->buf, s, length);
        c->net_output_bytes_curr_cmd += length;
        next->used += length;
        listDelNode(c->reply, ln);
        return;
    }

    /* Step 3: no gluing possible, the placeholder gets a block of its own. */
    size_t usable_size;
    clientReplyBlock *blk = zmalloc_usable(length + sizeof(clientReplyBlock), &usable_size);
    /* Take over the allocation's internal fragmentation */
    blk->size = usable_size - sizeof(clientReplyBlock);
    blk->used = length;
    blk->flag.buf_encoded = 0;
    memcpy(blk->buf, s, length);
    c->net_output_bytes_curr_cmd += length;
    listNodeValue(ln) = blk;
    c->reply_bytes += blk->size;

    closeClientOnOutputBufferLimitReached(c, 1);
}
|
|
|
|
/* Populate the length object and try gluing it to the next chunk. */
|
|
void setDeferredAggregateLen(client *c, void *node, long length, char prefix) {
|
|
serverAssert(length >= 0);
|
|
|
|
/* Abort when *node is NULL: when the client should not accept writes
|
|
* we return NULL in addReplyDeferredLen() */
|
|
if (node == NULL) return;
|
|
|
|
/* Things like *2\r\n, %3\r\n or ~4\r\n are emitted very often by the protocol
|
|
* so we have a few shared objects to use if the integer is small
|
|
* like it is most of the times. */
|
|
const size_t hdr_len = OBJ_SHARED_HDR_STRLEN(length);
|
|
const int opt_hdr = length < OBJ_SHARED_BULKHDR_LEN;
|
|
if (prefix == '*' && opt_hdr) {
|
|
setDeferredReply(c, node, shared.mbulkhdr[length]->ptr, hdr_len);
|
|
return;
|
|
}
|
|
if (prefix == '%' && opt_hdr) {
|
|
setDeferredReply(c, node, shared.maphdr[length]->ptr, hdr_len);
|
|
return;
|
|
}
|
|
if (prefix == '~' && opt_hdr) {
|
|
setDeferredReply(c, node, shared.sethdr[length]->ptr, hdr_len);
|
|
return;
|
|
}
|
|
|
|
char lenstr[128];
|
|
lenstr[0] = prefix;
|
|
size_t lenstr_len = ll2string(lenstr + 1, sizeof(lenstr) - 1, length);
|
|
lenstr[lenstr_len + 1] = '\r';
|
|
lenstr[lenstr_len + 2] = '\n';
|
|
setDeferredReply(c, node, lenstr, lenstr_len + 3);
|
|
}
|
|
|
|
/* Fill the deferred placeholder 'node' with an array header ('*') of
 * 'length' elements. */
void setDeferredArrayLen(client *c, void *node, long length) {
    const char array_prefix = '*';
    setDeferredAggregateLen(c, node, length, array_prefix);
}
|
|
|
|
/* Fill the deferred placeholder 'node' with the header of a map of 'length'
 * pairs. With RESP2 (c->resp == 2) this is an array of 2 * length elements,
 * otherwise a '%' header carrying 'length'. */
void setDeferredMapLen(client *c, void *node, long length) {
    if (c->resp == 2) {
        setDeferredAggregateLen(c, node, length * 2, '*');
    } else {
        setDeferredAggregateLen(c, node, length, '%');
    }
}
|
|
|
|
/* Fill the deferred placeholder 'node' with the header of a set of 'length'
 * elements: an array header ('*') when c->resp == 2, a '~' header
 * otherwise. */
void setDeferredSetLen(client *c, void *node, long length) {
    if (c->resp == 2) {
        setDeferredAggregateLen(c, node, length, '*');
    } else {
        setDeferredAggregateLen(c, node, length, '~');
    }
}
|
|
|
|
/* Fill the deferred placeholder 'node' with an attribute header ('|').
 * Asserts that the client protocol version is at least 3. */
void setDeferredAttributeLen(client *c, void *node, long length) {
    const char attribute_prefix = '|';
    serverAssert(c->resp >= 3);
    setDeferredAggregateLen(c, node, length, attribute_prefix);
}
|
|
|
|
/* Fill the deferred placeholder 'node' with a push header ('>').
 * Asserts that the client protocol version is at least 3. */
void setDeferredPushLen(client *c, void *node, long length) {
    const char push_prefix = '>';
    serverAssert(c->resp >= 3);
    setDeferredAggregateLen(c, node, length, push_prefix);
}
|
|
|
|
/* Prepare a client for future writes. This is used so that we can
|
|
* skip a large number of calls to prepareClientToWrite when
|
|
* a command produces a lot of discrete elements in its output. */
|
|
writePreparedClient *prepareClientForFutureWrites(client *c) {
|
|
if (prepareClientToWrite(c) == C_OK) {
|
|
return (writePreparedClient *)c;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
/* Add a double as a bulk reply */
|
|
void addReplyDouble(client *c, double d) {
|
|
if (c->resp == 3) {
|
|
char dbuf[MAX_D2STRING_CHARS + 3];
|
|
dbuf[0] = ',';
|
|
const int dlen = d2string(dbuf + 1, sizeof(dbuf) - 1, d);
|
|
dbuf[dlen + 1] = '\r';
|
|
dbuf[dlen + 2] = '\n';
|
|
dbuf[dlen + 3] = '\0';
|
|
addReplyProto(c, dbuf, dlen + 3);
|
|
} else {
|
|
char dbuf[MAX_LONG_DOUBLE_CHARS + 32];
|
|
/* In order to prepend the string length before the formatted number,
|
|
* but still avoid an extra memcpy of the whole number, we reserve space
|
|
* for maximum header `$0000\r\n`, print double, add the resp header in
|
|
* front of it, and then send the buffer with the right `start` offset. */
|
|
const int dlen = d2string(dbuf + 7, sizeof(dbuf) - 7, d);
|
|
int digits = digits10(dlen);
|
|
int start = 4 - digits;
|
|
serverAssert(start >= 0);
|
|
dbuf[start] = '$';
|
|
|
|
/* Convert `dlen` to string, putting it's digits after '$' and before the
|
|
* formatted double string. */
|
|
for (int i = digits, val = dlen; val && i > 0; --i, val /= 10) {
|
|
dbuf[start + i] = "0123456789"[val % 10];
|
|
}
|
|
dbuf[5] = '\r';
|
|
dbuf[6] = '\n';
|
|
dbuf[dlen + 7] = '\r';
|
|
dbuf[dlen + 8] = '\n';
|
|
dbuf[dlen + 9] = '\0';
|
|
addReplyProto(c, dbuf + start, dlen + 9 - start);
|
|
}
|
|
}
|
|
|
|
/* Reply with the big number whose text is the 'len' bytes at 'num': as a
 * bulk string when c->resp == 2, otherwise as "(<num>\r\n". */
void addReplyBigNum(client *c, const char *num, size_t len) {
    if (c->resp == 2) {
        addReplyBulkCBuffer(c, num, len);
        return;
    }

    addReplyProto(c, "(", 1);
    addReplyProto(c, num, len);
    addReplyProto(c, "\r\n", 2);
}
|
|
|
|
/* Add a long double as a bulk reply, but uses a human readable formatting
|
|
* of the double instead of exposing the crude behavior of doubles to the
|
|
* dear user. */
|
|
void addReplyHumanLongDouble(client *c, long double d) {
|
|
if (c->resp == 2) {
|
|
robj *o = createStringObjectFromLongDouble(d, 1);
|
|
addReplyBulk(c, o);
|
|
decrRefCount(o);
|
|
} else {
|
|
char buf[MAX_LONG_DOUBLE_CHARS];
|
|
int len = ld2string(buf, sizeof(buf), d, LD_STR_HUMAN);
|
|
addReplyProto(c, ",", 1);
|
|
addReplyProto(c, buf, len);
|
|
addReplyProto(c, "\r\n", 2);
|
|
}
|
|
}
|
|
|
|
/* Add a long long as integer reply or bulk len / multi bulk count.
|
|
* Basically this is used to output <prefix><long long><crlf>. */
|
|
static void _addReplyLongLongWithPrefix(client *c, long long ll, char prefix) {
|
|
char buf[128];
|
|
int len;
|
|
|
|
/* Things like $3\r\n or *2\r\n are emitted very often by the protocol
|
|
* so we have a few shared objects to use if the integer is small
|
|
* like it is most of the times. */
|
|
const int opt_hdr = ll < OBJ_SHARED_BULKHDR_LEN && ll >= 0;
|
|
const size_t hdr_len = OBJ_SHARED_HDR_STRLEN(ll);
|
|
if (prefix == '*' && opt_hdr) {
|
|
_addReplyToBufferOrList(c, shared.mbulkhdr[ll]->ptr, hdr_len);
|
|
return;
|
|
} else if (prefix == '$' && opt_hdr) {
|
|
_addReplyToBufferOrList(c, shared.bulkhdr[ll]->ptr, hdr_len);
|
|
return;
|
|
} else if (prefix == '%' && opt_hdr) {
|
|
_addReplyToBufferOrList(c, shared.maphdr[ll]->ptr, hdr_len);
|
|
return;
|
|
} else if (prefix == '~' && opt_hdr) {
|
|
_addReplyToBufferOrList(c, shared.sethdr[ll]->ptr, hdr_len);
|
|
return;
|
|
}
|
|
|
|
buf[0] = prefix;
|
|
len = ll2string(buf + 1, sizeof(buf) - 1, ll);
|
|
buf[len + 1] = '\r';
|
|
buf[len + 2] = '\n';
|
|
_addReplyToBufferOrList(c, buf, len + 3);
|
|
}
|
|
|
|
/* Reply with the integer 'll'. The values 0 and 1 are served by the shared
 * objects czero and cone, every other value is formatted with the ':'
 * prefix. */
void addReplyLongLong(client *c, long long ll) {
    if (ll == 0) {
        addReply(c, shared.czero);
        return;
    }
    if (ll == 1) {
        addReply(c, shared.cone);
        return;
    }

    if (prepareClientToWrite(c) == C_OK) {
        _addReplyLongLongWithPrefix(c, ll, ':');
    }
}
|
|
|
|
/* Emit the aggregate header "<prefix><length>\r\n". 'length' must not be
 * negative; nothing is emitted when the client cannot be written to. */
void addReplyAggregateLen(client *c, long length, int prefix) {
    serverAssert(length >= 0);
    if (prepareClientToWrite(c) == C_OK) {
        _addReplyLongLongWithPrefix(c, length, prefix);
    }
}
|
|
|
|
/* Emit an array header ('*') announcing 'length' elements. */
void addReplyArrayLen(client *c, long length) {
    const int array_prefix = '*';
    addReplyAggregateLen(c, length, array_prefix);
}
|
|
|
|
/* Variant of addReplyArrayLen for a client obtained from
 * prepareClientForFutureWrites(): prepareClientToWrite() is not called
 * again. 'length' must not be negative. */
void addWritePreparedReplyArrayLen(writePreparedClient *wpc, long length) {
    serverAssert(length >= 0);
    _addReplyLongLongWithPrefix((client *)wpc, length, '*');
}
|
|
|
|
/* Emit the header of a map of 'length' pairs. With c->resp == 2 this is an
 * array header carrying 2 * length, otherwise a '%' header carrying
 * 'length'. */
void addReplyMapLen(client *c, long length) {
    if (c->resp == 2) {
        addReplyAggregateLen(c, length * 2, '*');
    } else {
        addReplyAggregateLen(c, length, '%');
    }
}
|
|
|
|
/* Variant of addReplyMapLen for a client obtained from
 * prepareClientForFutureWrites(): prepareClientToWrite() is not called
 * again. With c->resp == 2 an array header carrying 2 * length is emitted,
 * otherwise a '%' header carrying 'length'. */
void addWritePreparedReplyMapLen(writePreparedClient *wpc, long length) {
    client *c = (client *)wpc;

    if (c->resp == 2) {
        _addReplyLongLongWithPrefix(c, length * 2, '*');
    } else {
        _addReplyLongLongWithPrefix(c, length, '%');
    }
}
|
|
|
|
/* Emit the header of a set of 'length' elements: an array header ('*') when
 * c->resp == 2, a '~' header otherwise. */
void addReplySetLen(client *c, long length) {
    if (c->resp == 2) {
        addReplyAggregateLen(c, length, '*');
    } else {
        addReplyAggregateLen(c, length, '~');
    }
}
|
|
|
|
/* Emit an attribute header ('|'). Asserts that the client protocol version
 * is at least 3. */
void addReplyAttributeLen(client *c, long length) {
    const int attribute_prefix = '|';
    serverAssert(c->resp >= 3);
    addReplyAggregateLen(c, length, attribute_prefix);
}
|
|
|
|
/* Emit a push header ('>'). Asserts that the client protocol version is at
 * least 3 and that the client has the 'pushing' flag set. */
void addReplyPushLen(client *c, long length) {
    const int push_prefix = '>';
    serverAssert(c->resp >= 3);
    serverAssertWithInfo(c, NULL, c->flag.pushing);
    addReplyAggregateLen(c, length, push_prefix);
}
|
|
|
|
/* Reply with a null: "$-1\r\n" when c->resp == 2, "_\r\n" otherwise. */
void addReplyNull(client *c) {
    if (c->resp != 2) {
        addReplyProto(c, "_\r\n", 3);
        return;
    }
    addReplyProto(c, "$-1\r\n", 5);
}
|
|
|
|
/* Reply with the boolean 'b': the shared integers cone / czero when
 * c->resp == 2, "#t\r\n" / "#f\r\n" otherwise. */
void addReplyBool(client *c, int b) {
    if (c->resp != 2) {
        const char *proto = b ? "#t\r\n" : "#f\r\n";
        addReplyProto(c, proto, 4);
        return;
    }
    addReply(c, b ? shared.cone : shared.czero);
}
|
|
|
|
/* A null array is a concept that no longer exists in RESP3. However
|
|
* RESP2 had it, so API-wise we have this call, that will emit the correct
|
|
* RESP2 protocol, however for RESP3 the reply will always be just the
|
|
* Null type "_\r\n". */
|
|
void addReplyNullArray(client *c) {
|
|
if (c->resp == 2) {
|
|
addReplyProto(c, "*-1\r\n", 5);
|
|
} else {
|
|
addReplyProto(c, "_\r\n", 3);
|
|
}
|
|
}
|
|
|
|
/* Create the length prefix of a bulk reply, example: $2234 */
|
|
void addReplyBulkLen(client *c, robj *obj) {
|
|
size_t len = stringObjectLen(obj);
|
|
if (prepareClientToWrite(c) != C_OK) return;
|
|
_addReplyLongLongWithPrefix(c, len, '$');
|
|
}
|
|
|
|
/* Try to avoid whole bulk string copy to a reply buffer
|
|
* If copy avoidance allowed then only pointer to object and string will be copied to the buffer */
|
|
static int tryAvoidBulkStrCopyToReply(client *c, robj *obj) {
|
|
if (!isCopyAvoidPreferred(c, obj)) return C_ERR;
|
|
if (prepareClientToWrite(c) != C_OK) return C_ERR;
|
|
|
|
_addBulkStrRefToBufferOrList(c, obj);
|
|
|
|
return C_OK;
|
|
}
|
|
|
|
/* Add an Object as a bulk reply */
|
|
void addReplyBulk(client *c, robj *obj) {
|
|
if (tryAvoidBulkStrCopyToReply(c, obj) == C_OK) {
|
|
/* If copy avoidance allowed, then we explicitly maintain net_output_bytes_curr_cmd. */
|
|
serverAssert(obj->encoding == OBJ_ENCODING_RAW);
|
|
size_t str_len = sdslen(obj->ptr);
|
|
uint32_t num_len = digits10(str_len);
|
|
/* RESP encodes bulk strings as $<length>\r\n<data>\r\n */
|
|
c->net_output_bytes_curr_cmd += (num_len + 3); /* $<length>\r\n */
|
|
c->net_output_bytes_curr_cmd += str_len; /* <data> */
|
|
c->net_output_bytes_curr_cmd += 2; /* \r\n */
|
|
return;
|
|
}
|
|
addReplyBulkLen(c, obj);
|
|
addReply(c, obj);
|
|
addReplyProto(c, "\r\n", 2);
|
|
}
|
|
|
|
/* Add a C buffer as bulk reply */
|
|
void addReplyBulkCBuffer(client *c, const void *p, size_t len) {
|
|
if (prepareClientToWrite(c) != C_OK) return;
|
|
_addReplyLongLongWithPrefix(c, len, '$');
|
|
_addReplyToBufferOrList(c, p, len);
|
|
_addReplyToBufferOrList(c, "\r\n", 2);
|
|
}
|
|
|
|
/* Variant of addReplyBulkCBuffer for a client obtained from
 * prepareClientForFutureWrites(): prepareClientToWrite() is not called
 * again. */
void addWritePreparedReplyBulkCBuffer(writePreparedClient *wpc, const void *p, size_t len) {
    client *target = (client *)wpc;

    _addReplyLongLongWithPrefix(target, len, '$');
    _addReplyToBufferOrList(target, p, len);
    _addReplyToBufferOrList(target, "\r\n", 2);
}
|
|
|
|
/* Add sds to reply (takes ownership of sds and frees it) */
|
|
void addReplyBulkSds(client *c, sds s) {
|
|
if (prepareClientToWrite(c) != C_OK) {
|
|
sdsfree(s);
|
|
return;
|
|
}
|
|
_addReplyLongLongWithPrefix(c, sdslen(s), '$');
|
|
_addReplyToBufferOrList(c, s, sdslen(s));
|
|
sdsfree(s);
|
|
_addReplyToBufferOrList(c, "\r\n", 2);
|
|
}
|
|
|
|
/* Variant of addReplyBulkSds for a client obtained from
 * prepareClientForFutureWrites(): prepareClientToWrite() is not called
 * again. 's' is freed. */
void addWritePreparedReplyBulkSds(writePreparedClient *wpc, sds s) {
    client *target = (client *)wpc;
    size_t data_len = sdslen(s);

    _addReplyLongLongWithPrefix(target, data_len, '$');
    _addReplyToBufferOrList(target, s, data_len);
    sdsfree(s);
    _addReplyToBufferOrList(target, "\r\n", 2);
}
|
|
|
|
/* Set sds to a deferred reply (for symmetry with addReplyBulkSds it also frees the sds) */
|
|
void setDeferredReplyBulkSds(client *c, void *node, sds s) {
|
|
sds reply = sdscatprintf(sdsempty(), "$%d\r\n%s\r\n", (unsigned)sdslen(s), s);
|
|
setDeferredReply(c, node, reply, sdslen(reply));
|
|
sdsfree(reply);
|
|
sdsfree(s);
|
|
}
|
|
|
|
/* Add a C null term string as bulk reply */
|
|
void addReplyBulkCString(client *c, const char *s) {
|
|
if (s == NULL) {
|
|
addReplyNull(c);
|
|
} else {
|
|
addReplyBulkCBuffer(c, s, strlen(s));
|
|
}
|
|
}
|
|
|
|
/* Add a long long as a bulk reply */
|
|
void addReplyBulkLongLong(client *c, long long ll) {
|
|
char buf[64];
|
|
int len;
|
|
|
|
len = ll2string(buf, 64, ll);
|
|
addReplyBulkCBuffer(c, buf, len);
|
|
}
|
|
|
|
/* Variant of addReplyBulkLongLong for a client obtained from
 * prepareClientForFutureWrites(): the decimal representation of 'll' is sent
 * through addWritePreparedReplyBulkCBuffer. */
void addWritePreparedReplyBulkLongLong(writePreparedClient *wpc, long long ll) {
    char digits[64];
    int ndigits = ll2string(digits, sizeof(digits), ll);

    addWritePreparedReplyBulkCBuffer(wpc, digits, ndigits);
}
|
|
|
|
/* Reply with a verbatim type having the specified extension.
|
|
*
|
|
* The 'ext' is the "extension" of the file, actually just a three
|
|
* character type that describes the format of the verbatim string.
|
|
* For instance "txt" means it should be interpreted as a text only
|
|
* file by the receiver, "md " as markdown, and so forth. Only the
|
|
* three first characters of the extension are used, and if the
|
|
* provided one is shorter than that, the remaining is filled with
|
|
* spaces. */
|
|
void addReplyVerbatim(client *c, const char *s, size_t len, const char *ext) {
|
|
if (c->resp == 2) {
|
|
addReplyBulkCBuffer(c, s, len);
|
|
} else {
|
|
char buf[32];
|
|
size_t preflen = snprintf(buf, sizeof(buf), "=%zu\r\nxxx:", len + 4);
|
|
char *p = buf + preflen - 4;
|
|
for (int i = 0; i < 3; i++) {
|
|
if (*ext == '\0') {
|
|
p[i] = ' ';
|
|
} else {
|
|
p[i] = *ext++;
|
|
}
|
|
}
|
|
addReplyProto(c, buf, preflen);
|
|
addReplyProto(c, s, len);
|
|
addReplyProto(c, "\r\n", 2);
|
|
}
|
|
}
|
|
|
|
/* This function is similar to the addReplyHelp function but adds the
|
|
* ability to pass in two arrays of strings. Some commands have
|
|
* some additional subcommands based on the specific feature implementation
|
|
* the server is compiled with (currently just clustering). This function allows
|
|
* to pass is the common subcommands in `help` and any implementation
|
|
* specific subcommands in `extended_help`.
|
|
*/
|
|
void addExtendedReplyHelp(client *c, const char **help, const char **extended_help) {
|
|
sds cmd = sdsnew((char *)c->argv[0]->ptr);
|
|
void *blenp = addReplyDeferredLen(c);
|
|
int blen = 0;
|
|
int idx = 0;
|
|
|
|
sdstoupper(cmd);
|
|
addReplyStatusFormat(c, "%s <subcommand> [<arg> [value] [opt] ...]. Subcommands are:", cmd);
|
|
sdsfree(cmd);
|
|
|
|
while (help[blen]) addReplyStatus(c, help[blen++]);
|
|
if (extended_help) {
|
|
while (extended_help[idx]) addReplyStatus(c, extended_help[idx++]);
|
|
}
|
|
blen += idx;
|
|
|
|
addReplyStatus(c, "HELP");
|
|
addReplyStatus(c, " Print this help.");
|
|
|
|
blen += 1; /* Account for the header. */
|
|
blen += 2; /* Account for the footer. */
|
|
setDeferredArrayLen(c, blenp, blen);
|
|
}
|
|
|
|
/* Add an array of C strings as status replies with a heading.
|
|
* This function is typically invoked by commands that support
|
|
* subcommands in response to the 'help' subcommand. The help array
|
|
* is terminated by NULL sentinel. */
|
|
void addReplyHelp(client *c, const char **help) {
|
|
addExtendedReplyHelp(c, help, NULL);
|
|
}
|
|
|
|
/* Add a suggestive error reply.
|
|
* This function is typically invoked by from commands that support
|
|
* subcommands in response to an unknown subcommand or argument error. */
|
|
void addReplySubcommandSyntaxError(client *c) {
|
|
sds cmd = sdsnew((char *)c->argv[0]->ptr);
|
|
sdstoupper(cmd);
|
|
addReplyErrorFormat(c, "unknown subcommand or wrong number of arguments for '%.128s'. Try %s HELP.",
|
|
(char *)c->argv[1]->ptr, cmd);
|
|
sdsfree(cmd);
|
|
}
|
|
|
|
/* Return non-zero when the deferred reply buffer of the client is enabled.
 * The value ULLONG_MAX in deferred_reply_bytes is the "disabled" marker. */
inline int isDeferredReplyEnabled(client *c) {
    int disabled = (c->deferred_reply_bytes == ULLONG_MAX);
    return !disabled;
}
|
|
|
|
/* Commands that generate replies before triggering keyspace notifications must
|
|
* use a deferred reply buffer. This allows postponing the actual transmission
|
|
* of the reply until after the client is unblocked, in case it was blocked by
|
|
* a keyspace notification. This is necessary because modules subscribed to
|
|
* keyspace notifications can block the client from within the notification
|
|
* callback. */
|
|
void initDeferredReplyBuffer(client *c) {
|
|
if (moduleNotifyKeyspaceSubscribersCnt() == 0) return;
|
|
if (c->deferred_reply == NULL) {
|
|
c->deferred_reply = listCreate();
|
|
listSetFreeMethod(c->deferred_reply, freeClientReplyValue);
|
|
}
|
|
if (!isDeferredReplyEnabled(c)) c->deferred_reply_bytes = 0;
|
|
}
|
|
|
|
/* Drop whatever is queued in the client's deferred reply list and mark
 * deferral as disabled (deferred_reply_bytes == ULLONG_MAX).
 *
 * The list is only created lazily by initDeferredReplyBuffer(), while
 * commitDeferredReplyBuffer() calls this helper whenever deferral is not
 * enabled, so the list pointer may legitimately be NULL here: in that case
 * there is nothing to empty and only the marker is (re)written. */
static void resetDeferredReplyBuffer(client *c) {
    if (c->deferred_reply != NULL) listEmpty(c->deferred_reply);
    c->deferred_reply_bytes = ULLONG_MAX;
}
|
|
|
|
/* Move the client deferred reply buffer into the client reply buffer and put the client
|
|
* in the pending write queue. */
|
|
void commitDeferredReplyBuffer(client *c, int skip_if_blocked) {
|
|
if (skip_if_blocked && c->flag.blocked) return;
|
|
|
|
if (!isDeferredReplyEnabled(c) || (c->deferred_reply && listLength(c->deferred_reply) == 0)) {
|
|
resetDeferredReplyBuffer(c);
|
|
return;
|
|
}
|
|
|
|
listJoin(c->reply, c->deferred_reply);
|
|
c->reply_bytes += c->deferred_reply_bytes;
|
|
|
|
resetDeferredReplyBuffer(c);
|
|
if (prepareClientToWrite(c) != C_OK) {
|
|
return;
|
|
}
|
|
/* We call it here because this function may affect the reply
|
|
* buffer offset (see function comment) */
|
|
reqresSaveClientReplyOffset(c);
|
|
}
|
|
|
|
/* Append 'src' client output buffers into 'dst' client output buffers.
|
|
* This function clears the output buffers of 'src' */
|
|
void AddReplyFromClient(client *dst, client *src) {
|
|
/* If the source client contains a partial response due to client output
|
|
* buffer limits, propagate that to the dest rather than copy a partial
|
|
* reply. We don't wanna run the risk of copying partial response in case
|
|
* for some reason the output limits don't reach the same decision (maybe
|
|
* they changed) */
|
|
if (src->flag.close_asap) {
|
|
sds client = catClientInfoString(sdsempty(), dst, server.hide_user_data_from_log);
|
|
freeClientAsync(dst);
|
|
serverLog(LL_WARNING, "Client %s scheduled to be closed ASAP for overcoming of output buffer limits.", client);
|
|
sdsfree(client);
|
|
return;
|
|
}
|
|
|
|
/* First add the static buffer (either into the static buffer or reply list) */
|
|
serverAssert(src->flag.buf_encoded == 0);
|
|
addReplyProto(dst, src->buf, src->bufpos);
|
|
|
|
/* We need to check with prepareClientToWrite again (after addReplyProto)
|
|
* since addReplyProto may have changed something (like CLIENT_CLOSE_ASAP) */
|
|
if (prepareClientToWrite(dst) != C_OK) return;
|
|
|
|
/* We're bypassing _addReplyProtoToList, so we need to add the pre/post
|
|
* checks in it. */
|
|
if (dst->flag.close_after_reply) return;
|
|
|
|
/* Concatenate the reply list into the dest */
|
|
if (listLength(src->reply)) listJoin(dst->reply, src->reply);
|
|
dst->reply_bytes += src->reply_bytes;
|
|
src->reply_bytes = 0;
|
|
src->bufpos = 0;
|
|
|
|
if (src->deferred_reply_errors) {
|
|
deferredAfterErrorReply(dst, src->deferred_reply_errors);
|
|
listRelease(src->deferred_reply_errors);
|
|
src->deferred_reply_errors = NULL;
|
|
}
|
|
|
|
/* Check output buffer limits */
|
|
closeClientOnOutputBufferLimitReached(dst, 1);
|
|
}
|
|
|
|
/* Append the listed errors to the server error statistics. the input
|
|
* list is not modified and remains the responsibility of the caller. */
|
|
void deferredAfterErrorReply(client *c, list *errors) {
|
|
listIter li;
|
|
listNode *ln;
|
|
listRewind(errors, &li);
|
|
while ((ln = listNext(&li))) {
|
|
sds err = ln->value;
|
|
afterErrorReply(c, err, sdslen(err), 0);
|
|
}
|
|
}
|
|
|
|
/* Logically copy 'src' replica client buffers info to 'dst' replica.
|
|
* Basically increase referenced buffer block node reference count. */
|
|
void copyReplicaOutputBuffer(client *dst, client *src) {
|
|
serverAssert(src->bufpos == 0 && listLength(src->reply) == 0);
|
|
|
|
if (src->repl_data->ref_repl_buf_node == NULL) return;
|
|
dst->repl_data->ref_repl_buf_node = src->repl_data->ref_repl_buf_node;
|
|
dst->repl_data->ref_block_pos = src->repl_data->ref_block_pos;
|
|
((replBufBlock *)listNodeValue(dst->repl_data->ref_repl_buf_node))->refcount++;
|
|
}
|
|
|
|
/* Return true if the specified client has pending reply buffers to write to
|
|
* the socket. */
|
|
int clientHasPendingReplies(client *c) {
|
|
if (getClientType(c) == CLIENT_TYPE_REPLICA) {
|
|
/* Replicas use global shared replication buffer instead of
|
|
* private output buffer. */
|
|
serverAssert(c->bufpos == 0 && listLength(c->reply) == 0);
|
|
if (c->repl_data->ref_repl_buf_node == NULL) return 0;
|
|
|
|
/* If the last replication buffer block content is totally sent,
|
|
* we have nothing to send. */
|
|
listNode *ln = listLast(server.repl_buffer_blocks);
|
|
replBufBlock *tail = listNodeValue(ln);
|
|
if (ln == c->repl_data->ref_repl_buf_node && c->repl_data->ref_block_pos == tail->used) return 0;
|
|
|
|
return 1;
|
|
} else {
|
|
return c->bufpos || listLength(c->reply);
|
|
}
|
|
}
|
|
|
|
/* Accept callback installed by acceptCommonHandler() through connAccept().
 *
 * The client is taken from the connection private data. The handler:
 *  - schedules the client for release if the connection is not in the
 *    connected state;
 *  - refuses non-local peers while protected mode is on and the default
 *    user has no password, writing an explanatory error to the socket;
 *  - switches the client to the user named by connGetPeerUsername(), when
 *    such a name is reported and maps to an enabled ACL user (an ACL log
 *    entry is added otherwise);
 *  - finally counts the connection and fires the module "client connected"
 *    event. */
void clientAcceptHandler(connection *conn) {
    client *c = connGetPrivateData(conn);

    if (connGetState(conn) != CONN_STATE_CONNECTED) {
        serverLog(LL_WARNING, "Error accepting a client connection: %s (addr=%s laddr=%s)", connGetLastError(conn),
                  getClientPeerId(c), getClientSockname(c));
        freeClientAsync(c);
        return;
    }

    /* If the server is running in protected mode (the default) and there
     * is no password set, nor a specific interface is bound, we don't accept
     * requests from non loopback interfaces. Instead we try to explain the
     * user what to do to fix it if needed. */
    if (server.protected_mode && (DefaultUser->flags & USER_FLAG_NOPASS) && connIsLocal(conn) != 1) {
        char *denied_msg = "-DENIED Running in protected mode because protected "
                           "mode is enabled and no password is set for the default user. "
                           "In this mode connections are only accepted from the loopback interface. "
                           "If you want to connect from external computers, you "
                           "may adopt one of the following solutions: "
                           "1) Just disable protected mode sending the command "
                           "'CONFIG SET protected-mode no' from the loopback interface "
                           "by connecting from the same host the server is "
                           "running, however MAKE SURE it's not publicly accessible "
                           "from internet if you do so. Use CONFIG REWRITE to make this "
                           "change permanent. "
                           "2) Alternatively you can just disable the protected mode by "
                           "editing the configuration file, and setting the protected "
                           "mode option to 'no', and then restarting the server. "
                           "3) If you started the server manually just for testing, restart "
                           "it with the '--protected-mode no' option. "
                           "4) Set up an authentication password for the default user. "
                           "NOTE: You only need to do one of the above things in order for "
                           "the server to start accepting connections from the outside.\r\n";
        if (connWrite(c->conn, denied_msg, strlen(denied_msg)) == -1) {
            /* Best effort only: the result is tested just to silence the
             * unused-result warning. */
        }
        server.stat_rejected_conn++;
        freeClientAsync(c);
        return;
    }

    /* Auto-authenticate from cert_user field if set */
    sds peer_user = connGetPeerUsername(conn);
    if (peer_user != NULL) {
        user *acl_user = ACLGetUserByName(peer_user, sdslen(peer_user));
        if (acl_user != NULL && (acl_user->flags & USER_FLAG_ENABLED)) {
            clientSetUser(c, acl_user, 1);
            moduleNotifyUserChanged(c);
            serverLog(LL_VERBOSE, "TLS: Auto-authenticated client as %s",
                      server.hide_user_data_from_log ? "*redacted*" : acl_user->name);
        } else {
            addACLLogEntry(c, ACL_INVALID_TLS_CERT_AUTH, ACL_LOG_CTX_TOPLEVEL, 0, peer_user, NULL);
        }
        sdsfree(peer_user);
    }

    server.stat_numconnections++;
    moduleFireServerEvent(VALKEYMODULE_EVENT_CLIENT_CHANGE, VALKEYMODULE_SUBEVENT_CLIENT_CHANGE_CONNECTED, c);
}
|
|
|
|
/* Common tail of the accept path for a freshly accepted connection.
 *
 * conn  - the new connection, expected to be in the accepting state.
 * flags - client flags requested by the listener; only 'unix_socket' is
 *         propagated to the new client here.
 * ip    - unused.
 *
 * The connection is closed right away when it is in an unexpected state or
 * when the number of clients (plus cluster connections) already reached
 * server.maxclients. Otherwise a client is created and connAccept() is
 * started with clientAcceptHandler() as completion callback. */
void acceptCommonHandler(connection *conn, struct ClientFlags flags, char *ip) {
    client *c;
    UNUSED(ip);

    /* Peer and local addresses, used only for logging below. Each buffer is
     * formatted with its own size so that the two can never get out of sync. */
    char addr[CONN_ADDR_STR_LEN] = {0};
    char laddr[CONN_ADDR_STR_LEN] = {0};
    connFormatAddr(conn, addr, sizeof(addr), 1);
    connFormatAddr(conn, laddr, sizeof(laddr), 0);

    if (connGetState(conn) != CONN_STATE_ACCEPTING) {
        serverLog(LL_VERBOSE, "Accepted client connection in error state: %s (addr=%s laddr=%s)",
                  connGetLastError(conn), addr, laddr);
        connClose(conn);
        return;
    }

    /* Limit the number of connections we take at the same time.
     *
     * Admission control will happen before a client is created and connAccept()
     * called, because we don't want to even start transport-level negotiation
     * if rejected. */
    if (listLength(server.clients) + getClusterConnectionsCount() >= server.maxclients) {
        char *err;
        if (server.cluster_enabled)
            err = "-ERR max number of clients + cluster "
                  "connections reached\r\n";
        else
            err = "-ERR max number of clients reached\r\n";

        /* That's a best effort error message, don't check write errors.
         * Note that for TLS connections, no handshake was done yet so nothing
         * is written and the connection will just drop. */
        if (connWrite(conn, err, strlen(err)) == -1) {
            /* Nothing to do, Just to avoid the warning... */
        }
        server.stat_rejected_conn++;
        connClose(conn);
        return;
    }

    /* Create connection and client */
    if ((c = createClient(conn)) == NULL) {
        serverLog(LL_WARNING, "Error registering fd event for the new client connection: %s (addr=%s laddr=%s)",
                  connGetLastError(conn), addr, laddr);
        connClose(conn); /* May be already closed, just ignore errors */
        return;
    }

    /* Last chance to keep flags */
    if (flags.unix_socket) c->flag.unix_socket = 1;

    /* Initiate accept.
     *
     * Note that connAccept() is free to do two things here:
     * 1. Call clientAcceptHandler() immediately;
     * 2. Schedule a future call to clientAcceptHandler().
     *
     * Because of that, we must do nothing else afterwards.
     */
    if (connAccept(conn, clientAcceptHandler) == C_ERR) {
        if (connGetState(conn) == CONN_STATE_ERROR)
            serverLog(LL_WARNING, "Error accepting a client connection: %s (addr=%s laddr=%s)", connGetLastError(conn),
                      getClientPeerId(c), getClientSockname(c));
        freeClient(connGetPrivateData(conn));
        return;
    }
}
|
|
|
|
/* Release the client's 'original_argv' vector, if there is one, and reset
 * the related fields. The release is first offered to the IO threads; when
 * that is refused (C_ERR) every argument is dereferenced and the array is
 * freed right here. */
void freeClientOriginalArgv(client *c) {
    /* No original vector: this client's command was never rewritten. */
    if (c->original_argv == NULL) return;

    int offloaded = tryOffloadFreeArgvToIOThreads(c, c->original_argc, c->original_argv) != C_ERR;
    if (!offloaded) {
        int idx = 0;
        while (idx < c->original_argc) {
            decrRefCount(c->original_argv[idx]);
            idx++;
        }
        zfree(c->original_argv);
    }

    c->original_argv = NULL;
    c->original_argc = 0;
}
|
|
|
|
/* Release the client's current argument vector and clear every field that
 * describes the command being processed (argc, cmd, parsed_cmd, argv sizes
 * and the vector pointer itself). */
void freeClientArgv(client *c) {
    /* If original_argv exists, 'c->argv' was allocated by the main thread,
     * so it's more efficient to free it directly here rather than offloading
     * to IO threads. Otherwise the IO threads are tried first and the local
     * path is only the fallback. */
    int free_locally = c->original_argv || tryOffloadFreeArgvToIOThreads(c, c->argc, c->argv) == C_ERR;

    if (free_locally) {
        int idx = 0;
        while (idx < c->argc) {
            decrRefCount(c->argv[idx]);
            idx++;
        }
        zfree(c->argv);
    }

    c->argc = 0;
    c->cmd = NULL;
    c->parsed_cmd = NULL;
    c->argv_len_sum = 0;
    c->argv_len = 0;
    c->argv = NULL;
}
|
|
|
|
/* Close all the replicas connections. This is useful in chained replication
|
|
* when we resync with our own primary and want to force all our replicas to
|
|
* resync with us as well. */
|
|
void disconnectReplicas(void) {
|
|
listIter li;
|
|
listNode *ln;
|
|
listRewind(server.replicas, &li);
|
|
while ((ln = listNext(&li))) {
|
|
freeClient((client *)ln->value);
|
|
}
|
|
}
|
|
|
|
/* Remove the specified client from global lists where the client could
|
|
* be referenced, not including the Pub/Sub channels.
|
|
* This is used by freeClient() and replicationCachePrimary(). */
|
|
void unlinkClient(client *c) {
|
|
listNode *ln;
|
|
|
|
/* Wait for IO operations to be done before unlinking the client. */
|
|
waitForClientIO(c);
|
|
|
|
/* If this is marked as current client unset it. */
|
|
if (c->conn && server.current_client == c) server.current_client = NULL;
|
|
|
|
/* Certain operations must be done only if the client has an active connection.
|
|
* If the client was already unlinked or if it's a "fake client" the
|
|
* conn is already set to NULL. */
|
|
if (c->conn) {
|
|
/* Remove from the list of active clients. */
|
|
if (c->client_list_node) {
|
|
uint64_t id = htonu64(c->id);
|
|
raxRemove(server.clients_index, (unsigned char *)&id, sizeof(id), NULL);
|
|
listDelNode(server.clients, c->client_list_node);
|
|
c->client_list_node = NULL;
|
|
}
|
|
removeClientFromPendingCommandsBatch(c);
|
|
|
|
/* Check if this is a replica waiting for diskless replication (rdb pipe),
|
|
* in which case it needs to be cleaned from that list.
|
|
*
|
|
* Alternatively, if this is a slot migration job for an export operation, we need to
|
|
* always check if this was the target. The state of the migration isn't relevant since the
|
|
* snapshot child may take some time to die, during which the migration will continue past
|
|
* the snapshot state. */
|
|
if (c->repl_data && server.rdb_pipe_conns &&
|
|
((c->flag.replica && c->repl_data->repl_state == REPLICA_STATE_WAIT_BGSAVE_END))) {
|
|
int i;
|
|
int still_alive = 0;
|
|
for (i = 0; i < server.rdb_pipe_numconns; i++) {
|
|
if (server.rdb_pipe_conns[i] == c->conn) {
|
|
rdbPipeWriteHandlerConnRemoved(c->conn);
|
|
server.rdb_pipe_conns[i] = NULL;
|
|
}
|
|
if (server.rdb_pipe_conns[i]) still_alive++;
|
|
}
|
|
if (still_alive == 0) {
|
|
serverLog(LL_NOTICE, "Diskless rdb transfer, last replica dropped, killing fork child.");
|
|
killRDBChild();
|
|
}
|
|
}
|
|
/* Check if this is the slot migration client we are writing to in a
|
|
* child process*/
|
|
if (c->slot_migration_job && !isImportSlotMigrationJob(c->slot_migration_job) &&
|
|
server.slot_migration_pipe_conn == c->conn) {
|
|
server.slot_migration_pipe_conn = NULL;
|
|
serverLog(LL_NOTICE, "Slot migration target dropped, killing fork child.");
|
|
killSlotMigrationChild();
|
|
}
|
|
/* Only use shutdown when the fork is active and we are the parent. */
|
|
if (server.child_type && !c->flag.repl_rdb_channel) {
|
|
connShutdown(c->conn);
|
|
} else if (c->flag.repl_rdb_channel) {
|
|
shutdown(c->conn->fd, SHUT_RDWR);
|
|
}
|
|
connClose(c->conn);
|
|
c->conn = NULL;
|
|
}
|
|
|
|
/* Remove from the list of pending writes if needed. */
|
|
if (c->flag.pending_write) {
|
|
if (c->io_write_state == CLIENT_IDLE) {
|
|
listUnlinkNode(server.clients_pending_write, &c->clients_pending_write_node);
|
|
} else {
|
|
listUnlinkNode(server.clients_pending_io_write, &c->clients_pending_write_node);
|
|
}
|
|
c->flag.pending_write = 0;
|
|
}
|
|
|
|
/* Remove from the list of pending reads if needed. */
|
|
serverAssert(c->io_read_state != CLIENT_PENDING_IO && c->io_write_state != CLIENT_PENDING_IO);
|
|
if (c->flag.pending_read) {
|
|
listUnlinkNode(server.clients_pending_io_read, &c->pending_read_list_node);
|
|
c->flag.pending_read = 0;
|
|
}
|
|
|
|
|
|
/* When client was just unblocked because of a blocking operation,
|
|
* remove it from the list of unblocked clients. */
|
|
if (c->flag.unblocked) {
|
|
ln = listSearchKey(server.unblocked_clients, c);
|
|
serverAssert(ln != NULL);
|
|
listDelNode(server.unblocked_clients, ln);
|
|
c->flag.unblocked = 0;
|
|
}
|
|
|
|
/* Clear the tracking status. */
|
|
if (c->flag.tracking) disableTracking(c);
|
|
}
|
|
|
|
/* Clear the client state to resemble a newly connected client. */
|
|
void clearClientConnectionState(client *c) {
|
|
listNode *ln;
|
|
|
|
/* MONITOR clients are also marked with CLIENT_REPLICA, we need to
|
|
* distinguish between the two.
|
|
*/
|
|
if (c->flag.monitor) {
|
|
ln = listSearchKey(server.monitors, c);
|
|
serverAssert(ln != NULL);
|
|
listDelNode(server.monitors, ln);
|
|
|
|
c->flag.monitor = 0;
|
|
c->flag.replica = 0;
|
|
}
|
|
|
|
serverAssert(!(c->flag.replica || c->flag.primary || c->slot_migration_job));
|
|
|
|
if (c->flag.tracking) disableTracking(c);
|
|
selectDb(c, 0);
|
|
#ifdef LOG_REQ_RES
|
|
c->resp = server.client_default_resp;
|
|
#else
|
|
c->resp = 2;
|
|
#endif
|
|
|
|
clientSetDefaultAuth(c);
|
|
moduleNotifyUserChanged(c);
|
|
discardTransaction(c);
|
|
freeClientPubSubData(c);
|
|
|
|
if (c->name) {
|
|
decrRefCount(c->name);
|
|
c->name = NULL;
|
|
}
|
|
|
|
/* Note: lib_name and lib_ver are not reset since they still
|
|
* represent the client library behind the connection. */
|
|
|
|
/* Selectively clear state flags not covered above */
|
|
c->flag.asking = 0;
|
|
c->flag.readonly = 0;
|
|
c->flag.reply_off = 0;
|
|
c->flag.reply_skip_next = 0;
|
|
c->flag.no_touch = 0;
|
|
c->flag.no_evict = 0;
|
|
}
|
|
|
|
/* Release a client and everything it owns.
 *
 * Two situations end the function early without releasing the structure:
 *  - the client is protected (either kind of protection): the release is
 *    only scheduled through freeClientAsync();
 *  - the client is our primary and its state is eligible for caching: it is
 *    handed over to replicationCachePrimary() for a later partial resync.
 * In every other case the client memory is gone when the function returns. */
void freeClient(client *c) {
    listNode *ln;

    /* A protected client can't be freed right now: fall back to the
     * asynchronous path. */
    if (c->flag.protected || c->flag.protected_rdb_channel) {
        freeClientAsync(c);
        return;
    }

    /* Wait for IO operations to be done before proceeding */
    waitForClientIO(c);

    /* For connected clients, call the disconnection event of modules hooks. */
    if (c->conn) {
        moduleFireServerEvent(VALKEYMODULE_EVENT_CLIENT_CHANGE, VALKEYMODULE_SUBEVENT_CLIENT_CHANGE_DISCONNECTED, c);
    }

    /* Notify module system that this client auth status changed. */
    moduleNotifyUserChanged(c);
    freeClientModuleData(c);

    /* A client scheduled for async freeing is taken out of that queue here.
     * This can't be postponed: replicationCachePrimary() may be called
     * below, and by then the client must no longer be in the list of
     * clients to free. */
    if (c->flag.close_asap) {
        ln = listSearchKey(server.clients_to_close, c);
        serverAssert(ln != NULL);
        listDelNode(server.clients_to_close, ln);
    }

    /* When the client being disconnected is our primary, its state is
     * cached so that a partial resynchronization can be attempted later.
     * The flags are checked first to make sure the client is not in some
     * unexpected state. */
    if (server.primary && c->flag.primary) {
        serverLog(LL_NOTICE, "Connection with primary lost.");
        int cacheable = !c->flag.dont_cache_primary && !c->flag.protocol_error && !c->flag.blocked;
        if (cacheable) {
            c->flag.close_asap = 0;
            c->flag.close_after_reply = 0;
            replicationCachePrimary(c);
            return;
        }
    }

    /* Log link disconnection with replica */
    if (getClientType(c) == CLIENT_TYPE_REPLICA) {
        if (c->flag.repl_rdb_channel) {
            dualChannelServerLog(LL_NOTICE, "Replica %s rdb channel disconnected.", replicationGetReplicaName(c));
        } else {
            serverLog(LL_NOTICE, "Connection with replica %s lost.", replicationGetReplicaName(c));
        }
    }

    /* Handle slot migration connection closed. */
    if (c->slot_migration_job) {
        clusterHandleSlotMigrationClientClose(c->slot_migration_job);
    }

    /* Query buffer: the thread-shared buffer is only cleared, a buffer that
     * is not the shared one is freed. */
    if (c->querybuf && c->querybuf == thread_shared_qb) {
        sdsclear(c->querybuf);
    } else {
        sdsfree(c->querybuf);
    }
    c->querybuf = NULL;

    /* Deallocate structures used to block on blocking ops.
     * If there is any in-flight command, we don't record their duration. */
    c->duration = 0;
    if (c->flag.blocked) unblockClient(c, 1);

    freeClientBlockingState(c);
    freeClientPubSubData(c);

    /* Free data structures. */
    releaseReplyReferences(c);
    listRelease(c->reply);
    c->reply = NULL;
    zfree_with_size(c->buf, c->buf_usable_size);
    c->buf = NULL;
    listRelease(c->deferred_reply);

    /* freeClientArgv() looks at 'original_argv', so it runs before the
     * original vector is released. */
    freeClientArgv(c);
    freeClientOriginalArgv(c);
    discardCommandQueue(c);
    if (c->deferred_reply_errors) listRelease(c->deferred_reply_errors);
    c->deferred_reply_errors = NULL;
#ifdef LOG_REQ_RES
    reqresReset(c, 1);
#endif

    /* Remove the contribution that this client gave to our
     * incrementally computed memory usage. */
    if (c->conn) server.stat_clients_type_memory[c->last_memory_type] -= c->last_memory_usage;

    /* Unlink the client: this will close the socket, remove the I/O
     * handlers, and remove references of the client from different
     * places where active clients may be referenced. */
    unlinkClient(c);

    freeClientReplicationData(c);

    /* Remove client from memory usage buckets */
    if (c->mem_usage_bucket) {
        c->mem_usage_bucket->mem_usage_sum -= c->last_memory_usage;
        listDelNode(c->mem_usage_bucket->clients, c->mem_usage_bucket_node);
    }

    /* Release the remaining dynamically allocated fields, and at last the
     * client structure itself. */
    if (c->name) decrRefCount(c->name);
    if (c->lib_name) decrRefCount(c->lib_name);
    if (c->lib_ver) decrRefCount(c->lib_ver);
    freeClientMultiState(c);
    sdsfree(c->peerid);
    sdsfree(c->sockname);
    zfree(c);
}
|
|
|
|
/* Schedule a client to free it at a safe time in the beforeSleep() function.
|
|
* This function is useful when we need to terminate a client but we are in
|
|
* a context where calling freeClient() is not possible, because the client
|
|
* should be valid for the continuation of the flow of the program. */
|
|
void freeClientAsync(client *c) {
|
|
if (c->flag.close_asap || c->flag.script) return;
|
|
c->flag.close_asap = 1;
|
|
debugServerAssertWithInfo(c, NULL, listSearchKey(server.clients_to_close, c) == NULL);
|
|
listAddNodeTail(server.clients_to_close, c);
|
|
}
|
|
|
|
/* Log errors for invalid use and free the client in async way.
|
|
* We will add additional information about the client to the message. */
|
|
void logInvalidUseAndFreeClientAsync(client *c, const char *fmt, ...) {
|
|
va_list ap;
|
|
va_start(ap, fmt);
|
|
sds info = sdscatvprintf(sdsempty(), fmt, ap);
|
|
va_end(ap);
|
|
|
|
sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log);
|
|
serverLog(LL_WARNING, "%s, disconnecting it: %s", info, client);
|
|
|
|
sdsfree(info);
|
|
sdsfree(client);
|
|
freeClientAsync(c);
|
|
}
|
|
|
|
/* Resets the shared query buffer used by the given client.
|
|
* If any data remained in the buffer, the client will take ownership of the buffer
|
|
* and a new empty buffer will be allocated for the shared buffer. */
|
|
void resetSharedQueryBuf(client *c) {
|
|
serverAssert(c->querybuf == thread_shared_qb);
|
|
size_t remaining = sdslen(c->querybuf) - c->qb_pos;
|
|
|
|
if (remaining > 0) {
|
|
/* Let the client take ownership of the shared buffer. */
|
|
initSharedQueryBuf();
|
|
return;
|
|
}
|
|
|
|
c->querybuf = NULL;
|
|
sdsclear(thread_shared_qb);
|
|
c->qb_pos = 0;
|
|
}
|
|
|
|
/* Trims the client query buffer to the current position. */
|
|
void trimClientQueryBuffer(client *c) {
|
|
if (c->querybuf == thread_shared_qb) {
|
|
resetSharedQueryBuf(c);
|
|
}
|
|
|
|
if (c->querybuf == NULL) {
|
|
return;
|
|
}
|
|
|
|
serverAssert(c->qb_pos <= sdslen(c->querybuf));
|
|
|
|
if (c->qb_pos > 0) {
|
|
sdsrange(c->querybuf, c->qb_pos, -1);
|
|
c->qb_pos = 0;
|
|
}
|
|
}
|
|
|
|
/* Perform processing of the client before moving on to processing the next client.
|
|
* This is useful for performing operations that affect the global state but can't
|
|
* wait until we're done with all clients. In other words, it can't wait until beforeSleep().
|
|
* With IO threads enabled, this function offloads the write to the IO threads if possible. */
|
|
void beforeNextClient(client *c) {
|
|
/* Notice, this code is also called from 'processUnblockedClients'.
|
|
* But in case of a module blocked client (see RM_Call 'K' flag) we do not reach this code path.
|
|
* So whenever we change the code here we need to consider if we need this change on module
|
|
* blocked client as well */
|
|
|
|
/* Trim the query buffer to the current position. */
|
|
if (isReplicatedClient(c)) {
|
|
/* If the client is replicated, trim the querybuf to repl_applied,
|
|
* since primary client is very special, its querybuf not only
|
|
* used to parse command, but also proxy to sub-replicas.
|
|
*
|
|
* Here are some scenarios we cannot trim to qb_pos:
|
|
* 1. we don't receive complete command from primary
|
|
* 2. primary client blocked cause of client pause
|
|
* 3. io threads operate read, primary client flagged with CLIENT_PENDING_COMMAND
|
|
*
|
|
* In these scenarios, qb_pos points to the part of the current command
|
|
* or the beginning of next command, and the current command is not applied yet,
|
|
* so the repl_applied is not equal to qb_pos. */
|
|
if (c->repl_data->repl_applied) {
|
|
sdsrange(c->querybuf, c->repl_data->repl_applied, -1);
|
|
c->qb_pos -= c->repl_data->repl_applied;
|
|
c->repl_data->repl_applied = 0;
|
|
}
|
|
} else {
|
|
trimClientQueryBuffer(c);
|
|
}
|
|
/* Handle async frees */
|
|
/* Note: this doesn't make the server.clients_to_close list redundant because of
|
|
* cases where we want an async free of a client other than myself. For example
|
|
* in ACL modifications we disconnect clients authenticated to non-existent
|
|
* users (see ACL LOAD). */
|
|
if (c->flag.close_asap) {
|
|
freeClient(c);
|
|
return;
|
|
}
|
|
|
|
updateClientMemUsageAndBucket(c);
|
|
/* If IO threads are enabled try to write immediately the reply instead of waiting to beforeSleep,
|
|
* unless aof_fsync is set to always in which case we need to wait for beforeSleep after writing the aof buffer. */
|
|
if (server.aof_fsync != AOF_FSYNC_ALWAYS) {
|
|
trySendWriteToIOThreads(c);
|
|
}
|
|
}
|
|
|
|
/* Free the clients marked as CLOSE_ASAP, return the number of clients
|
|
* freed. */
|
|
int freeClientsInAsyncFreeQueue(void) {
|
|
int freed = 0;
|
|
listIter li;
|
|
listNode *ln;
|
|
|
|
listRewind(server.clients_to_close, &li);
|
|
while ((ln = listNext(&li)) != NULL) {
|
|
client *c = listNodeValue(ln);
|
|
|
|
if (c->flag.protected_rdb_channel) {
|
|
/* Check if it's safe to remove RDB connection protection during synchronization
|
|
* The primary gives a grace period before freeing this client because
|
|
* it serves as a reference to the first required replication data block for
|
|
* this replica */
|
|
if (!c->repl_data->rdb_client_disconnect_time) {
|
|
if (c->conn) connSetReadHandler(c->conn, NULL);
|
|
c->repl_data->rdb_client_disconnect_time = server.unixtime;
|
|
dualChannelServerLog(LL_VERBOSE, "Postpone RDB client id=%llu (%s) free for %d seconds",
|
|
(unsigned long long)c->id, replicationGetReplicaName(c), server.wait_before_rdb_client_free);
|
|
}
|
|
if (server.unixtime - c->repl_data->rdb_client_disconnect_time <= server.wait_before_rdb_client_free) continue;
|
|
dualChannelServerLog(
|
|
LL_NOTICE,
|
|
"Replica main channel failed to establish PSYNC within the grace period (%ld seconds). "
|
|
"Freeing RDB client %llu.",
|
|
(long int)(server.unixtime - c->repl_data->rdb_client_disconnect_time), (unsigned long long)c->id);
|
|
c->flag.protected_rdb_channel = 0;
|
|
}
|
|
|
|
if (c->flag.protected) continue;
|
|
|
|
c->flag.close_asap = 0;
|
|
freeClient(c);
|
|
listDelNode(server.clients_to_close, ln);
|
|
freed++;
|
|
}
|
|
return freed;
|
|
}
|
|
|
|
/* Return a client by ID, or NULL if the client ID is not in the set
|
|
* of registered clients. Note that "fake clients", created with -1 as FD,
|
|
* are not registered clients. */
|
|
client *lookupClientByID(uint64_t id) {
|
|
id = htonu64(id);
|
|
void *c = NULL;
|
|
raxFind(server.clients_index, (unsigned char *)&id, sizeof(id), &c);
|
|
return c;
|
|
}
|
|
|
|
/* Bookkeeping to run after a write to a replica. Does nothing unless
 * c->nwritten is positive. Otherwise the bytes are added to the replication
 * output statistics and the replica's position inside the shared replication
 * buffer is advanced by the amount written; finally an incremental trim of
 * the replication backlog is attempted. */
static void postWriteToReplica(client *c) {
    if (c->nwritten <= 0) return;

    server.stat_net_repl_output_bytes += c->nwritten;

    /* Walk forward to the last node that still has leftover data, moving the
     * replica's reference along the way: each node left behind loses one
     * reference and each node entered gains one. At the end the client
     * points to that node, with ref_block_pos as the offset inside it. */
    listNode *curr = c->repl_data->ref_repl_buf_node;
    replBufBlock *o = listNodeValue(curr);
    size_t nwritten = c->nwritten + c->repl_data->ref_block_pos;

    while (nwritten >= o->used) {
        listNode *following = listNextNode(curr);
        if (following == NULL) break; /* End of list */

        nwritten -= o->used;
        o->refcount--;

        curr = following;
        o = listNodeValue(curr);
        o->refcount++;
    }

    serverAssert(nwritten <= o->used);
    c->repl_data->ref_repl_buf_node = curr;
    c->repl_data->ref_block_pos = nwritten;

    incrementalTrimReplicationBacklog(REPL_BACKLOG_TRIM_BLOCKS_PER_CALL);
}
|
|
|
|
/* Write pending replication data to a replica connection.
 *
 * The data lives in the shared list of replication buffer blocks. The range
 * to send starts at the replica's reference (node + offset) and ends at a
 * "last block / position" pair that depends on the calling thread: the tail
 * of server.repl_buffer_blocks in the main thread, or the values stored in
 * c->io_last_reply_block / c->io_last_bufpos otherwise.
 *
 * The number of bytes written (or the failing return value) is stored in
 * c->nwritten; WRITE_FLAGS_WRITE_ERROR is set in c->write_flags when a write
 * returns a value <= 0. */
static void writeToReplica(client *c) {
    listNode *last_node;
    size_t bufpos;

    serverAssert(c->bufpos == 0 && listLength(c->reply) == 0);
    /* Determine the last block and buffer position based on thread context */
    if (inMainThread()) {
        last_node = listLast(server.repl_buffer_blocks);
        if (!last_node) return;
        bufpos = ((replBufBlock *)listNodeValue(last_node))->used;
    } else {
        last_node = c->io_last_reply_block;
        serverAssert(last_node != NULL);
        bufpos = c->io_last_bufpos;
    }

    listNode *first_node = c->repl_data->ref_repl_buf_node;

    /* Handle the single block case */
    if (first_node == last_node) {
        replBufBlock *b = listNodeValue(first_node);
        c->nwritten = connWrite(c->conn, b->buf + c->repl_data->ref_block_pos, bufpos - c->repl_data->ref_block_pos);
        if (c->nwritten <= 0) {
            c->write_flags |= WRITE_FLAGS_WRITE_ERROR;
        }
        return;
    }

    /* Multiple blocks case: gather the blocks into an iovec array. */
    ssize_t total_bytes = 0;
    int iovcnt = 0;
    struct iovec iov_arr[IOV_MAX];
    struct iovec *iov = iov_arr;
    int iovmax = min(IOV_MAX, c->conn->iovcnt);

    for (listNode *cur_node = first_node; cur_node != NULL && iovcnt < iovmax; cur_node = listNextNode(cur_node)) {
        replBufBlock *cur_block = listNodeValue(cur_node);
        size_t start = (cur_node == first_node) ? c->repl_data->ref_block_pos : 0;
        size_t len = (cur_node == last_node) ? bufpos : cur_block->used;
        len -= start;

        /* For TLS, we should not call SSL_write() with num=0, so empty
         * ranges are not added to the array. The walk must still stop at
         * last_node: nothing beyond the end of the range determined above
         * may be gathered, even when the last block contributes no bytes. */
        if (unlikely(len == 0)) {
            if (cur_node == last_node) break;
            continue;
        }

        iov[iovcnt].iov_base = cur_block->buf + start;
        iov[iovcnt].iov_len = len;
        total_bytes += len;
        iovcnt++;
        if (cur_node == last_node) break;
    }

    if (total_bytes == 0) return;

    ssize_t totwritten = 0;
    while (iovcnt > 0) {
        int nwritten = connWritev(c->conn, iov, iovcnt);

        if (nwritten <= 0) {
            /* Report what was already written, if anything; the failing
             * return value otherwise. */
            c->write_flags |= WRITE_FLAGS_WRITE_ERROR;
            c->nwritten = (totwritten > 0) ? totwritten : nwritten;
            return;
        }

        totwritten += nwritten;

        if (totwritten == total_bytes) {
            break;
        }

        /* Update iov array: drop the entries fully written and advance
         * inside the first entry that was only partially written. */
        while (nwritten > 0) {
            if ((size_t)nwritten < iov[0].iov_len) {
                /* partial block written */
                iov[0].iov_base = (char *)iov[0].iov_base + nwritten;
                iov[0].iov_len -= nwritten;
                break;
            }

            /* full block written */
            nwritten -= iov[0].iov_len;
            iov++;
            iovcnt--;
        }
    }

    c->nwritten = totwritten;
}
|
|
|
|
/* A bulk string reply is scattered over 3 iov entries:
 * the length prefix ($<length>\r\n), the string itself (<data>) and the
 * suffix (\r\n). */
#define NUM_OF_IOV_PER_BULK_STR 3

/* Maximum size of a bulk string length prefix: the digits of a long plus
 * the '$' and the trailing \r\n. */
#define BULK_STR_LEN_PREFIX_MAX_SIZE (LONG_STR_SIZE + 3)
|
|
|
|
/* This struct is used by writevToClient to prepare iovec array for submitting to connWritev */
|
|
typedef struct replyIOV {
|
|
int iovcnt; /* number of elements in iov array */
|
|
int iovsize; /* capacity of iov array */
|
|
struct iovec *iov;
|
|
ssize_t iov_len_total; /* Total length of data pointed by iov array */
|
|
size_t last_written_len; /* Length of data in the last written buffer
|
|
* partially written in previous writevToClient invocation */
|
|
int limit_reached; /* Non zero if either max iov count or NET_MAX_WRITES_PER_EVENT limit
|
|
* reached during iovec array preparation */
|
|
/* Auxiliary fields for scattering BUFSTR_REF chunks from encoded buffers */
|
|
int prfxcnt; /* number of prefixes */
|
|
char (*prefixes)[BULK_STR_LEN_PREFIX_MAX_SIZE]; /* bulk string prefixes */
|
|
char *crlf; /* bulk string suffix */
|
|
} replyIOV;
|
|
|
|
/* Per-buffer bookkeeping recorded by writevToClient: describes how one
 * reply buffer was scattered into the iov array. */
typedef struct bufWriteMetadata {
    char *buf;
    size_t bufpos;
    /* Actual number of bytes that go out on the wire for this buffer.
     * Not the same as bufpos when the buffer is encoded. */
    uint64_t data_len;
    /* Non zero if the whole buffer was scattered to iov; zero if the
     * process stopped early because a limit was encountered. */
    int complete;
} bufWriteMetadata;
|
|
|
|
/* Prepare 'reply' for a new writevToClient invocation.
 *
 * iovsize/iov_arr describe the caller-provided iovec storage, prefixes and
 * crlf are the caller-provided scratch areas for bulk string framing.
 * The amount of data already sent from the last written buffer is taken
 * from c->io_last_written.data_len so the write can resume after it. */
static void initReplyIOV(client *c, int iovsize, struct iovec *iov_arr, char (*prefixes)[], char *crlf, replyIOV *reply) {
    /* Caller-provided storage */
    reply->iov = iov_arr;
    reply->iovsize = iovsize;
    reply->prefixes = prefixes;
    reply->crlf = crlf;

    /* Nothing has been scattered yet */
    reply->iovcnt = 0;
    reply->prfxcnt = 0;
    reply->iov_len_total = 0;
    reply->limit_reached = 0;

    /* Resume point left by the previous invocation */
    reply->last_written_len = c->io_last_written.data_len;
}
|
|
|
|
/* Append a plain chunk of buf_len bytes starting at buf to the iov array.
 *
 * Does nothing once a limit has been reached, and flags the limit when the
 * iov array is already full. metadata->data_len always grows by buf_len for
 * an accepted chunk, including the part that is skipped because it was sent
 * by a previous writevToClient invocation (reply->last_written_len). */
static void addPlainBufferToReplyIOV(char *buf, size_t buf_len, replyIOV *reply, bufWriteMetadata *metadata) {
    if (reply->limit_reached) return;

    if (reply->iovcnt == reply->iovsize) {
        reply->limit_reached = 1;
        return;
    }

    /* The data length is counted from the start of the buffer, even if some
     * of it is skipped below. */
    metadata->data_len += buf_len;

    size_t already_sent = reply->last_written_len;
    if (already_sent >= buf_len) {
        /* The whole chunk went out earlier: consume it from the skip budget */
        reply->last_written_len = already_sent - buf_len;
        return;
    }

    /* Reference only the part that still has to be written */
    struct iovec *slot = &reply->iov[reply->iovcnt];
    slot->iov_base = buf + already_sent;
    slot->iov_len = buf_len - already_sent;
    reply->last_written_len = 0;

    reply->iov_len_total += slot->iov_len;
    reply->iovcnt++;
}
|
|
|
|
/* Scatter an array of bulkStrRef entries (buf, buf_len bytes) to the iov array.
 *
 * Every referenced string is emitted in RESP bulk string form,
 * $<length>\r\n<data>\r\n, using one prefix slot from reply->prefixes, the
 * string data itself and the shared reply->crlf suffix. Stops as soon as
 * a limit is reached. */
static void addBulkStringToReplyIOV(char *buf, size_t buf_len, replyIOV *reply, bufWriteMetadata *metadata) {
    for (bulkStrRef *ref = (bulkStrRef *)buf; buf_len > 0 && !reply->limit_reached;
         ref++, buf_len -= sizeof(bulkStrRef)) {
        size_t data_len = sdslen(ref->str);

        /* Render "$<length>\r\n" into the next free prefix slot */
        char *prefix = reply->prefixes[reply->prfxcnt];
        prefix[0] = '$';
        size_t digits = ll2string(prefix + 1, sizeof(reply->prefixes[0]) - 3, data_len);
        prefix[digits + 1] = '\r';
        prefix[digits + 2] = '\n';

        /* The prefix slot is consumed only if the prefix really made it into
         * the iov array in this writevToClient invocation */
        int iovcnt_before = reply->iovcnt;
        addPlainBufferToReplyIOV(prefix, digits + 3, reply, metadata);
        if (reply->iovcnt > iovcnt_before) reply->prfxcnt++;

        addPlainBufferToReplyIOV(ref->str, data_len, reply, metadata);
        addPlainBufferToReplyIOV(reply->crlf, 2, reply, metadata);
    }
}
|
|
|
|
/* Walk an encoded buffer (a sequence of payloadHeader + payload records
 * occupying the first bufpos bytes of buf) and scatter every payload to the
 * iov array. Plain payloads are added as is, any other payload is handled as
 * an array of bulk string references. Stops as soon as a limit is reached. */
static void addEncodedBufferToReplyIOV(char *buf, size_t bufpos, replyIOV *reply, bufWriteMetadata *metadata) {
    char *end = buf + bufpos;
    for (char *cursor = buf; cursor < end && !reply->limit_reached;) {
        payloadHeader *header = (payloadHeader *)cursor;
        char *payload = cursor + sizeof(payloadHeader);

        if (header->payload_type != PLAIN_REPLY) {
            uint64_t len_before = metadata->data_len;
            addBulkStringToReplyIOV(payload, header->payload_len, reply, metadata);
            /* Keep the actual reply length for cluster slot stats */
            header->reply_len = metadata->data_len - len_before;
        } else {
            addPlainBufferToReplyIOV(payload, header->payload_len, reply, metadata);
        }

        cursor = payload + header->payload_len;
    }
}
|
|
|
|
/* Scatter one reply buffer (the first bufpos bytes of buf) to the iov array
 * and fill 'metadata' with what was done.
 *
 * An encoded buffer is reported complete only if no limit was hit while it
 * was processed; a plain buffer is always reported complete. After the buffer
 * is added, the limit is raised if the total prepared length exceeds
 * NET_MAX_WRITES_PER_EVENT. */
static void addBufferToReplyIOV(int encoded, char *buf, size_t bufpos, replyIOV *reply, bufWriteMetadata *metadata) {
    metadata->buf = buf;
    metadata->bufpos = bufpos;
    metadata->data_len = 0;

    if (!encoded) {
        addPlainBufferToReplyIOV(buf, bufpos, reply, metadata);
        metadata->complete = 1;
    } else {
        addEncodedBufferToReplyIOV(buf, bufpos, reply, metadata);
        metadata->complete = !reply->limit_reached;
    }

    if (reply->iov_len_total > NET_MAX_WRITES_PER_EVENT) reply->limit_reached = 1;
}
|
|
|
|
/*
|
|
* This function calculates and stores on the client next:
|
|
* io_last_written_buf - Last buffer that has been written to the client connection
|
|
* io_last_written_bufpos - The buffer has been written until this position
|
|
* io_last_written_data_len - The actual length of the data written from this buffer
|
|
* This length differs from written bufpos in case of copy avoidance
|
|
*
|
|
* The io_last_written_buf and io_last_written_bufpos are used by _postWriteToClient
|
|
* to detect last client reply buffer that can be released
|
|
*
|
|
* The io_last_written_data_len is used by writevToClient for resuming write from the point
|
|
* where previous writevToClient invocation stopped
|
|
**/
|
|
static void saveLastWrittenBuf(client *c, bufWriteMetadata *metadata, int bufcnt, size_t totlen, size_t totwritten) {
|
|
int last = bufcnt - 1;
|
|
if (totwritten == totlen) {
|
|
c->io_last_written.buf = metadata[last].buf;
|
|
/* Zero io_last_written.bufpos indicates buffer written incompletely */
|
|
c->io_last_written.bufpos = (metadata[last].complete ? metadata[last].bufpos : 0);
|
|
c->io_last_written.data_len = metadata[last].data_len;
|
|
return;
|
|
}
|
|
|
|
last = -1;
|
|
int64_t remaining = totwritten + c->io_last_written.data_len;
|
|
while (remaining > 0) remaining -= metadata[++last].data_len;
|
|
serverAssert(last < bufcnt);
|
|
|
|
c->io_last_written.buf = metadata[last].buf;
|
|
/* Zero io_last_written_bufpos indicates buffer written incompletely */
|
|
c->io_last_written.bufpos = (metadata[last].complete && remaining == 0 ? metadata[last].bufpos : 0);
|
|
c->io_last_written.data_len = (size_t)(metadata[last].data_len + remaining);
|
|
}
|
|
|
|
/* Adjust reply->iov to point to start of unwritten blocks */
|
|
static void proceedToUnwritten(replyIOV *reply, int nwritten) {
|
|
while (nwritten > 0) {
|
|
if ((size_t)nwritten < reply->iov[0].iov_len) {
|
|
reply->iov[0].iov_base = (char *)reply->iov[0].iov_base + nwritten;
|
|
reply->iov[0].iov_len -= nwritten;
|
|
break;
|
|
}
|
|
nwritten -= reply->iov[0].iov_len;
|
|
reply->iov++;
|
|
reply->iovcnt--;
|
|
}
|
|
}
|
|
|
|
/* This function should be called from _writeToClient when the reply list is not empty,
|
|
* it gathers the scattered buffers from reply list and sends them away with connWritev.
|
|
* If we write successfully, it returns C_OK, otherwise, C_ERR is returned.
|
|
* Sets the c->nwritten to the number of bytes the server wrote to the client.
|
|
* Can be called from the main thread or an I/O thread */
|
|
static int writevToClient(client *c) {
|
|
int iovmax = min(IOV_MAX, c->conn->iovcnt);
|
|
struct iovec iov_arr[iovmax];
|
|
/* iov_arr can accommodate iovmax / NUM_OF_IOV_PER_BULK_STR full bulk string replies
|
|
* and one partial bulk reply */
|
|
char prefixes[iovmax / NUM_OF_IOV_PER_BULK_STR + 1][BULK_STR_LEN_PREFIX_MAX_SIZE];
|
|
char crlf[2] = {'\r', '\n'};
|
|
size_t bufcnt = 0;
|
|
|
|
size_t bufpos = 0;
|
|
listNode *lastblock;
|
|
if (inMainThread()) {
|
|
lastblock = listLast(c->reply);
|
|
bufpos = c->bufpos;
|
|
} else {
|
|
lastblock = c->io_last_reply_block;
|
|
bufpos = lastblock ? (size_t)c->bufpos : c->io_last_bufpos;
|
|
}
|
|
|
|
int reply_blocks = (lastblock ? listLength(c->reply) : 0);
|
|
/* +1 is for c->buf */
|
|
size_t replyLen = min(reply_blocks + 1, iovmax);
|
|
bufWriteMetadata buf_metadata[replyLen];
|
|
|
|
replyIOV reply;
|
|
initReplyIOV(c, iovmax, iov_arr, prefixes, crlf, &reply);
|
|
|
|
/* If the static reply buffer is not empty,
|
|
* add it to the iov array for writev() as well. */
|
|
if (bufpos > 0) {
|
|
addBufferToReplyIOV(c->flag.buf_encoded, c->buf, bufpos, &reply, &buf_metadata[bufcnt++]);
|
|
}
|
|
|
|
if (lastblock) {
|
|
listIter iter;
|
|
listNode *next;
|
|
listRewind(c->reply, &iter);
|
|
while ((next = listNext(&iter)) && !reply.limit_reached) {
|
|
clientReplyBlock *o = listNodeValue(next);
|
|
|
|
size_t used = o->used;
|
|
/* Use c->io_last_bufpos as the currently used portion of the block.
|
|
* We use io_last_bufpos instead of o->used to ensure that we only access data guaranteed to be visible to the
|
|
* current thread. Using o->used, which may have been updated by the main thread, could lead to accessing data
|
|
* that may not yet be visible to the current thread*/
|
|
if (!inMainThread() && next == lastblock) used = c->io_last_bufpos;
|
|
|
|
if (used == 0) { /* empty node, skip over it. */
|
|
if (next == lastblock) break;
|
|
continue;
|
|
}
|
|
|
|
addBufferToReplyIOV(o->flag.buf_encoded, o->buf, used, &reply, &buf_metadata[bufcnt]);
|
|
if (!buf_metadata[bufcnt].data_len) break;
|
|
bufcnt++;
|
|
|
|
if (next == lastblock) break;
|
|
|
|
if (reply.iovcnt == reply.iovsize) {
|
|
reply.limit_reached = 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
ssize_t totwritten = 0;
|
|
while (1) {
|
|
int nwritten = connWritev(c->conn, reply.iov, reply.iovcnt);
|
|
if (nwritten <= 0) {
|
|
c->write_flags |= WRITE_FLAGS_WRITE_ERROR;
|
|
totwritten = totwritten > 0 ? totwritten : nwritten;
|
|
break;
|
|
}
|
|
totwritten += nwritten;
|
|
|
|
if (totwritten == reply.iov_len_total) break;
|
|
|
|
if (totwritten > NET_MAX_WRITES_PER_EVENT) {
|
|
/* Note that we avoid to send more than NET_MAX_WRITES_PER_EVENT
|
|
* bytes, Since it's a good idea to serve
|
|
* other clients as well, even if a very large request comes from
|
|
* super fast link that is always able to accept data (in real world
|
|
* scenario think about 'KEYS *' against the loopback interface).
|
|
*
|
|
* However if we are over the maxmemory limit we ignore that and
|
|
* just deliver as much data as it is possible to deliver. */
|
|
int ignore_max_write_limit = server.maxmemory > 0 && zmalloc_used_memory() > server.maxmemory;
|
|
if (!ignore_max_write_limit) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
proceedToUnwritten(&reply, nwritten);
|
|
}
|
|
|
|
c->nwritten = totwritten;
|
|
if (totwritten > 0) {
|
|
saveLastWrittenBuf(c, buf_metadata, bufcnt, reply.iov_len_total, totwritten);
|
|
}
|
|
return totwritten > 0 ? C_OK : C_ERR;
|
|
}
|
|
|
|
/* This function does actual writing output buffers to non-replica client, it is called by writeToClient.
|
|
* If we write successfully, it returns C_OK, otherwise, C_ERR is returned,
|
|
* and 'c->nwritten' is set to the number of bytes the server wrote to the client. */
|
|
int _writeToClient(client *c) {
|
|
listNode *lastblock;
|
|
size_t bufpos;
|
|
|
|
if (inMainThread()) {
|
|
/* In the main thread, access bufpos and lastblock directly */
|
|
lastblock = listLast(c->reply);
|
|
bufpos = (size_t)c->bufpos;
|
|
} else {
|
|
/* If there is a last block, use bufpos directly; otherwise, use io_last_bufpos */
|
|
bufpos = c->io_last_reply_block ? (size_t)c->bufpos : c->io_last_bufpos;
|
|
lastblock = c->io_last_reply_block;
|
|
}
|
|
|
|
/* If the reply list is not empty or buffer is encoded,
|
|
* use writev to save system calls and TCP packets */
|
|
if (lastblock || c->flag.buf_encoded) return writevToClient(c);
|
|
|
|
/* If io_last_written_data_len is nonzero it must relate to c->buf */
|
|
serverAssert(c->io_last_written.data_len == 0 || c->io_last_written.buf == c->buf);
|
|
ssize_t bytes_to_write = bufpos - c->io_last_written.data_len;
|
|
ssize_t tot_written = 0;
|
|
|
|
while (tot_written < bytes_to_write) {
|
|
int nwritten = connWrite(c->conn, c->buf + c->io_last_written.data_len + tot_written, bytes_to_write - tot_written);
|
|
if (nwritten <= 0) {
|
|
c->write_flags |= WRITE_FLAGS_WRITE_ERROR;
|
|
tot_written = tot_written > 0 ? tot_written : nwritten;
|
|
break;
|
|
}
|
|
tot_written += nwritten;
|
|
}
|
|
|
|
c->nwritten = tot_written;
|
|
if (tot_written > 0) {
|
|
c->io_last_written.buf = c->buf;
|
|
c->io_last_written.bufpos = (tot_written == bytes_to_write ? bufpos : 0);
|
|
c->io_last_written.data_len = c->io_last_written.data_len + tot_written;
|
|
}
|
|
return tot_written > 0 ? C_OK : C_ERR;
|
|
}
|
|
|
|
/* Forget the last written buffer state stored on the client. */
void resetLastWrittenBuf(client *c) {
    c->io_last_written.data_len = 0;
    c->io_last_written.bufpos = 0;
    c->io_last_written.buf = NULL;
}
|
|
|
|
/* Release references to string objects inside an encoded buffer */
|
|
static void releaseBufReferences(char *buf, size_t bufpos) {
|
|
char *ptr = buf;
|
|
while (ptr < buf + bufpos) {
|
|
payloadHeader *header = (payloadHeader *)ptr;
|
|
ptr += sizeof(payloadHeader);
|
|
|
|
if (header->payload_type == BULK_STR_REF) {
|
|
clusterSlotStatsAddNetworkBytesOutForSlot(header->slot, header->reply_len);
|
|
|
|
bulkStrRef *str_ref = (bulkStrRef *)ptr;
|
|
size_t len = header->payload_len;
|
|
while (len > 0) {
|
|
decrRefCount(str_ref->obj);
|
|
str_ref++;
|
|
len -= sizeof(bulkStrRef);
|
|
}
|
|
} else {
|
|
serverAssert(header->payload_type == PLAIN_REPLY);
|
|
}
|
|
|
|
ptr += header->payload_len;
|
|
}
|
|
serverAssert(ptr == buf + bufpos);
|
|
}
|
|
|
|
/* Release the object references held by every encoded output buffer of the
 * client: the static buffer (when used and encoded) and each encoded block
 * of the reply list. */
void releaseReplyReferences(client *c) {
    if (c->flag.buf_encoded && c->bufpos > 0) releaseBufReferences(c->buf, c->bufpos);

    listIter iter;
    listRewind(c->reply, &iter);
    for (listNode *node = listNext(&iter); node != NULL; node = listNext(&iter)) {
        clientReplyBlock *block = (clientReplyBlock *)listNodeValue(node);
        if (!block->flag.buf_encoded) continue;
        releaseBufReferences(block->buf, block->used);
    }
}
|
|
|
|
/* Post-write bookkeeping for clients handled by _writeToClient.
 *
 * Does nothing if no bytes were written. Otherwise updates the output byte
 * statistics and releases every output buffer that was completely written,
 * walking the static buffer first and then the reply list, and stopping at
 * the last written buffer (c->io_last_written.buf). */
static void _postWriteToClient(client *c) {
    if (c->nwritten <= 0) return;

    if (getClientType(c) != CLIENT_TYPE_SLOT_EXPORT) {
        server.stat_net_output_bytes += c->nwritten;
    } else {
        server.stat_net_cluster_slot_export_bytes += c->nwritten;
    }

    if (c->bufpos > 0) {
        /* Is the static buffer the last written one? */
        int is_last = (c->buf == c->io_last_written.buf);
        /* A buffer before the last written one is complete by definition; the
         * last written one is complete only if written up to its end */
        int fully_written = !is_last || c->bufpos == c->io_last_written.bufpos;
        if (fully_written) {
            /* An encoded buffer holds references to bulk string objects */
            if (c->flag.buf_encoded) releaseBufReferences(c->buf, c->bufpos);
            /* Reset buffer metadata */
            c->bufpos = 0;
            c->flag.buf_encoded = 0;
            c->last_header = NULL;
            /* The last written buffer is gone, so is the last written state */
            if (is_last) resetLastWrittenBuf(c);
        }
        if (is_last) return;
    }

    listIter iter;
    listNode *node;
    listRewind(c->reply, &iter);
    while ((node = listNext(&iter)) != NULL) {
        clientReplyBlock *block = listNodeValue(node);
        /* Is this block the last written one? */
        int is_last = (block->buf == c->io_last_written.buf);
        int fully_written = !is_last || block->used == c->io_last_written.bufpos;
        if (fully_written) {
            c->reply_bytes -= block->size;
            /* An encoded block holds references to bulk string objects */
            if (block->flag.buf_encoded) releaseBufReferences(block->buf, block->used);
            listDelNode(c->reply, node);
            /* The last written buffer is gone, so is the last written state */
            if (is_last) resetLastWrittenBuf(c);
        }
        if (is_last) return;
    }
}
|
|
|
|
/* Updates the client's memory usage and bucket and server stats after writing.
|
|
* If a write handler is installed , it will attempt to clear the write event.
|
|
* If the client is no longer valid, it will return C_ERR, otherwise C_OK. */
|
|
int postWriteToClient(client *c) {
|
|
c->io_last_reply_block = NULL;
|
|
c->io_last_bufpos = 0;
|
|
/* Update total number of writes on server */
|
|
server.stat_total_writes_processed++;
|
|
if (getClientType(c) != CLIENT_TYPE_REPLICA) {
|
|
_postWriteToClient(c);
|
|
} else {
|
|
postWriteToReplica(c);
|
|
}
|
|
|
|
if (c->write_flags & WRITE_FLAGS_WRITE_ERROR) {
|
|
if (connGetState(c->conn) != CONN_STATE_CONNECTED) {
|
|
serverLog(LL_VERBOSE, "Error writing to client: %s", connGetLastError(c->conn));
|
|
freeClientAsync(c);
|
|
return C_ERR;
|
|
}
|
|
}
|
|
if (c->nwritten > 0) {
|
|
c->net_output_bytes += c->nwritten;
|
|
/* For replicated clients we don't count sending data
|
|
* as an interaction, since we always send ACK commands
|
|
* that take some time to just fill the socket output buffer.
|
|
* We just rely on data / pings received for timeout detection. */
|
|
if (!isReplicatedClient(c)) c->last_interaction = server.unixtime;
|
|
}
|
|
if (!clientHasPendingReplies(c)) {
|
|
resetLastWrittenBuf(c);
|
|
if (connHasWriteHandler(c->conn)) {
|
|
connSetWriteHandler(c->conn, NULL);
|
|
}
|
|
|
|
/* Close connection after entire reply has been sent. */
|
|
if (c->flag.close_after_reply) {
|
|
freeClientAsync(c);
|
|
return C_ERR;
|
|
}
|
|
}
|
|
/* Update client's memory usage after writing.*/
|
|
updateClientMemUsageAndBucket(c);
|
|
return C_OK;
|
|
}
|
|
|
|
/* Write data in output buffers to client. Return C_OK if the client
|
|
* is still valid after the call, C_ERR if it was freed because of some
|
|
* error.
|
|
*
|
|
* This function is called by main-thread only */
|
|
int writeToClient(client *c) {
|
|
if (c->io_write_state != CLIENT_IDLE || c->io_read_state != CLIENT_IDLE) return C_OK;
|
|
|
|
c->nwritten = 0;
|
|
c->write_flags = 0;
|
|
|
|
if (getClientType(c) == CLIENT_TYPE_REPLICA) {
|
|
writeToReplica(c);
|
|
} else {
|
|
_writeToClient(c);
|
|
}
|
|
|
|
return postWriteToClient(c);
|
|
}
|
|
|
|
/* Write event handler. Just send data to the client. */
|
|
void sendReplyToClient(connection *conn) {
|
|
client *c = connGetPrivateData(conn);
|
|
if (trySendWriteToIOThreads(c) == C_OK) return;
|
|
writeToClient(c);
|
|
}
|
|
|
|
/* Handle a client whose query buffer reached the maximum length: log the
 * client info together with the first 64 bytes of the query buffer, schedule
 * the client to be freed and count the disconnection. */
void handleQbLimitReached(client *c) {
    sds ci = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log);
    sds bytes = sdscatrepr(sdsempty(), c->querybuf, 64);

    serverLog(LL_WARNING, "Closing client that reached max query buffer length: %s (qbuf initial bytes: %s)", ci,
              bytes);

    sdsfree(ci);
    sdsfree(bytes);

    freeClientAsync(c);
    server.stat_client_qbuf_limit_disconnections++;
}
|
|
|
|
/* Handle read errors and update statistics.
|
|
*
|
|
* Called only from the main thread.
|
|
* If the read was done in an I/O thread, this function is invoked after the
|
|
* read job has completed, in the main thread context.
|
|
*
|
|
* Returns:
|
|
* - C_OK if the querybuf can be further processed.
|
|
* - C_ERR if not. */
|
|
int handleReadResult(client *c) {
|
|
serverAssert(inMainThread());
|
|
server.stat_total_reads_processed++;
|
|
if (c->nread <= 0) {
|
|
if (c->nread == -1) {
|
|
if (connGetState(c->conn) != CONN_STATE_CONNECTED) {
|
|
serverLog(LL_VERBOSE, "Reading from client: %s", connGetLastError(c->conn));
|
|
freeClientAsync(c);
|
|
}
|
|
} else if (c->nread == 0) {
|
|
if (server.verbosity <= LL_VERBOSE) {
|
|
sds info = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log);
|
|
serverLog(LL_VERBOSE, "Client closed connection %s", info);
|
|
sdsfree(info);
|
|
}
|
|
freeClientAsync(c);
|
|
}
|
|
return C_ERR;
|
|
}
|
|
|
|
c->last_interaction = server.unixtime;
|
|
c->net_input_bytes += c->nread;
|
|
if (isReplicatedClient(c)) {
|
|
c->repl_data->read_reploff += c->nread;
|
|
if (getClientType(c) == CLIENT_TYPE_PRIMARY) {
|
|
server.stat_net_repl_input_bytes += c->nread;
|
|
} else {
|
|
server.stat_net_cluster_slot_import_bytes += c->nread;
|
|
}
|
|
} else {
|
|
server.stat_net_input_bytes += c->nread;
|
|
}
|
|
|
|
/* Handle QB limit */
|
|
if (c->read_flags & READ_FLAGS_QB_LIMIT_REACHED) {
|
|
handleQbLimitReached(c);
|
|
return C_ERR;
|
|
}
|
|
return C_OK;
|
|
}
|
|
|
|
|
|
/* React to a parsing error recorded in c->read_flags.
 *
 * For every known READ_FLAGS_ERROR_* flag an error is reported through
 * setProtocolError; all of them except the "unexpected inline from replicated
 * client" case also send an error reply to the client first. That last case
 * only logs a warning, with a message that depends on whether the client is
 * a slot import client.
 *
 * The flags are checked in a fixed order and only the first match is handled.
 * Being called without any known error flag set is a programming error and
 * trips an assertion. */
void handleParseError(client *c) {
    int flags = c->read_flags;
    if (flags & READ_FLAGS_ERROR_BIG_INLINE_REQUEST) {
        addReplyError(c, "Protocol error: too big inline request");
        setProtocolError("too big inline request", c);
    } else if (flags & READ_FLAGS_ERROR_BIG_MULTIBULK) {
        addReplyError(c, "Protocol error: too big mbulk count string");
        setProtocolError("too big mbulk count string", c);
    } else if (flags & READ_FLAGS_ERROR_INVALID_MULTIBULK_LEN) {
        addReplyError(c, "Protocol error: invalid multibulk length");
        setProtocolError("invalid mbulk count", c);
    } else if (flags & READ_FLAGS_ERROR_UNAUTHENTICATED_MULTIBULK_LEN) {
        addReplyError(c, "Protocol error: unauthenticated multibulk length");
        setProtocolError("unauth mbulk count", c);
    } else if (flags & READ_FLAGS_ERROR_UNAUTHENTICATED_BULK_LEN) {
        addReplyError(c, "Protocol error: unauthenticated bulk length");
        setProtocolError("unauth bulk length", c);
    } else if (flags & READ_FLAGS_ERROR_BIG_BULK_COUNT) {
        addReplyError(c, "Protocol error: too big bulk count string");
        setProtocolError("too big bulk count string", c);
    } else if (flags & READ_FLAGS_ERROR_MBULK_UNEXPECTED_CHARACTER) {
        /* Echo the offending character found at the current parse position */
        addReplyErrorFormat(c, "Protocol error: expected '$', got '%c'", c->querybuf[c->qb_pos]);
        setProtocolError("expected $ but got something else", c);
    } else if (flags & READ_FLAGS_ERROR_MBULK_INVALID_BULK_LEN) {
        addReplyError(c, "Protocol error: invalid bulk length");
        setProtocolError("invalid bulk length", c);
    } else if (flags & READ_FLAGS_ERROR_UNBALANCED_QUOTES) {
        addReplyError(c, "Protocol error: unbalanced quotes in request");
        setProtocolError("unbalanced quotes in inline request", c);
    } else if (flags & READ_FLAGS_ERROR_INVALID_CRLF) {
        addReplyError(c, "Protocol error: invalid CRLF in request");
        setProtocolError("invalid CRLF in request", c);
    } else if (flags & READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_REPLICATED_CLIENT) {
        if (getClientType(c) == CLIENT_TYPE_SLOT_IMPORT) {
            serverLog(LL_WARNING, "WARNING: Receiving inline protocol from slot import, import stream corruption? Closing the "
                                  "slot import connection.");
            setProtocolError("Import using the inline protocol. Desync?", c);
        } else {
            serverLog(LL_WARNING, "WARNING: Receiving inline protocol from primary, primary stream corruption? Closing the "
                                  "primary connection and discarding the cached primary.");
            setProtocolError("Master using the inline protocol. Desync?", c);
        }
    } else {
        /* A bare string literal is always true and would make the assertion
         * a no-op; "0 &&" forces the failure while keeping the message in
         * the asserted expression. */
        serverAssertWithInfo(c, NULL, 0 && "Unknown parsing error");
    }
}
|
|
|
|
/* Returns non zero if any of the parsing error flags is set in
 * c->read_flags (the value is the set of error flags that are set). */
int isParsingError(client *c) {
    return c->read_flags & (READ_FLAGS_ERROR_BIG_INLINE_REQUEST |
                            READ_FLAGS_ERROR_BIG_MULTIBULK |
                            READ_FLAGS_ERROR_INVALID_MULTIBULK_LEN |
                            READ_FLAGS_ERROR_UNAUTHENTICATED_MULTIBULK_LEN |
                            READ_FLAGS_ERROR_UNAUTHENTICATED_BULK_LEN |
                            READ_FLAGS_ERROR_MBULK_INVALID_BULK_LEN |
                            READ_FLAGS_ERROR_BIG_BULK_COUNT |
                            READ_FLAGS_ERROR_MBULK_UNEXPECTED_CHARACTER |
                            READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_REPLICATED_CLIENT |
                            READ_FLAGS_ERROR_UNBALANCED_QUOTES |
                            READ_FLAGS_ERROR_INVALID_CRLF);
}
|
|
|
|
/* This function is called after the query-buffer was parsed.
|
|
* It is used to handle parsing errors and to update the client state.
|
|
* The function returns C_OK if a command can be executed, otherwise C_ERR. */
|
|
parseResult handleParseResults(client *c) {
|
|
if (isParsingError(c)) {
|
|
handleParseError(c);
|
|
return PARSE_ERR;
|
|
}
|
|
|
|
if (c->read_flags & READ_FLAGS_INLINE_ZERO_QUERY_LEN && getClientType(c) == CLIENT_TYPE_REPLICA) {
|
|
c->repl_data->repl_ack_time = server.unixtime;
|
|
}
|
|
|
|
if (c->read_flags & READ_FLAGS_INLINE_ZERO_QUERY_LEN) {
|
|
/* in case the client's query was an empty line we will ignore it and proceed to process the rest of the buffer
|
|
* if any */
|
|
resetClient(c);
|
|
return PARSE_OK;
|
|
}
|
|
|
|
if (c->read_flags & READ_FLAGS_PARSING_NEGATIVE_MBULK_LEN) {
|
|
/* Multibulk processing could see a <= 0 length. */
|
|
resetClient(c);
|
|
return PARSE_OK;
|
|
}
|
|
|
|
if (c->read_flags & READ_FLAGS_PARSING_COMPLETED) {
|
|
return PARSE_OK;
|
|
} else {
|
|
return PARSE_NEEDMORE;
|
|
}
|
|
}
|
|
|
|
/* Process the completion of an IO write operation for a client.
|
|
* This function handles various post-write tasks, including updating client state,
|
|
* allow_async_writes - A flag indicating whether I/O threads can handle pending writes for this client.
|
|
* returns 1 if processing completed successfully, 0 if processing is skipped. */
|
|
int processClientIOWriteDone(client *c, int allow_async_writes) {
|
|
/* memory barrier acquire to get the latest client state */
|
|
atomic_thread_fence(memory_order_acquire);
|
|
/* If a client is protected, don't proceed to check the write results as it may trigger conn close. */
|
|
if (c->flag.protected) return 0;
|
|
|
|
listUnlinkNode(server.clients_pending_io_write, &c->clients_pending_write_node);
|
|
c->flag.pending_write = 0;
|
|
c->io_write_state = CLIENT_IDLE;
|
|
|
|
/* Don't post-process-writes to clients that are going to be closed anyway. */
|
|
if (c->flag.close_asap) return 0;
|
|
|
|
/* Update processed count on server */
|
|
server.stat_io_writes_processed += 1;
|
|
|
|
connSetPostponeUpdateState(c->conn, 0);
|
|
connUpdateState(c->conn);
|
|
if (postWriteToClient(c) == C_ERR) {
|
|
return 1;
|
|
}
|
|
|
|
if (clientHasPendingReplies(c)) {
|
|
if (c->write_flags & WRITE_FLAGS_WRITE_ERROR) {
|
|
/* Install the write handler if there are pending writes in some of the clients as a result of not being
|
|
* able to write everything in one go. */
|
|
installClientWriteHandler(c);
|
|
} else {
|
|
/* If we can send the client to the I/O thread, let it handle the write. */
|
|
if (allow_async_writes && trySendWriteToIOThreads(c) == C_OK) return 1;
|
|
/* Try again in the next eventloop */
|
|
putClientInPendingWriteQueue(c);
|
|
}
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
/* This function handles the post-processing of I/O write operations that have been
|
|
* completed for clients. It iterates through the list of clients with pending I/O
|
|
* writes and performs necessary actions based on their current state.
|
|
*
|
|
* Returns The number of clients processed during this function call. */
|
|
int processIOThreadsWriteDone(void) {
|
|
if (listLength(server.clients_pending_io_write) == 0) return 0;
|
|
int processed = 0;
|
|
listNode *ln;
|
|
|
|
listNode *next = listFirst(server.clients_pending_io_write);
|
|
while (next) {
|
|
ln = next;
|
|
next = listNextNode(ln);
|
|
client *c = listNodeValue(ln);
|
|
|
|
/* Client is still waiting for a pending I/O - skip it */
|
|
if (c->io_write_state == CLIENT_PENDING_IO || c->io_read_state == CLIENT_PENDING_IO) continue;
|
|
|
|
processed += processClientIOWriteDone(c, 1);
|
|
}
|
|
|
|
return processed;
|
|
}
|
|
|
|
/* This function is called just before entering the event loop, in the hope
|
|
* we can just write the replies to the client output buffer without any
|
|
* need to use a syscall in order to install the writable event handler,
|
|
* get it called, and so forth. */
|
|
int handleClientsWithPendingWrites(void) {
|
|
int processed = 0;
|
|
int pending_writes = listLength(server.clients_pending_write);
|
|
if (pending_writes == 0) return processed; /* Return ASAP if there are no clients. */
|
|
|
|
/* Adjust the number of I/O threads based on the number of pending writes this is required in case pending_writes >
|
|
* poll_events (for example in pubsub) */
|
|
adjustIOThreadsByEventLoad(pending_writes, 1);
|
|
|
|
listIter li;
|
|
listNode *ln;
|
|
listRewind(server.clients_pending_write, &li);
|
|
while ((ln = listNext(&li))) {
|
|
client *c = listNodeValue(ln);
|
|
c->flag.pending_write = 0;
|
|
listUnlinkNode(server.clients_pending_write, ln);
|
|
|
|
/* If a client is protected, don't do anything,
|
|
* that may trigger write error or recreate handler. */
|
|
if (c->flag.protected) continue;
|
|
|
|
/* Don't write to clients that are going to be closed anyway. */
|
|
if (c->flag.close_asap) continue;
|
|
|
|
if (!clientHasPendingReplies(c)) continue;
|
|
|
|
/* If we can send the client to the I/O thread, let it handle the write. */
|
|
if (trySendWriteToIOThreads(c) == C_OK) continue;
|
|
|
|
/* We can't write to the client while IO operation is in progress. */
|
|
if (c->io_write_state != CLIENT_IDLE || c->io_read_state != CLIENT_IDLE) continue;
|
|
|
|
processed++;
|
|
|
|
/* Try to write buffers to the client socket. */
|
|
if (writeToClient(c) == C_ERR) continue;
|
|
|
|
/* If after the synchronous writes above we still have data to
|
|
* output to the client, we need to install the writable handler. */
|
|
if (clientHasPendingReplies(c)) {
|
|
installClientWriteHandler(c);
|
|
}
|
|
}
|
|
return processed;
|
|
}
|
|
|
|
/* resetClient prepare the client to process the next command */
|
|
void resetClient(client *c) {
|
|
serverCommandProc *prevcmd = c->cmd ? c->cmd->proc : NULL;
|
|
serverCommandProc *prevParentCmd = c->cmd && c->cmd->parent ? c->cmd->parent->proc : NULL;
|
|
|
|
freeClientArgv(c);
|
|
freeClientOriginalArgv(c);
|
|
c->cur_script = NULL;
|
|
c->net_input_bytes_curr_cmd = 0;
|
|
c->slot = -1;
|
|
c->flag.executing_command = 0;
|
|
c->flag.replication_done = 0;
|
|
c->flag.buffered_reply = 0;
|
|
c->flag.keyspace_notified = 0;
|
|
c->net_output_bytes_curr_cmd = 0;
|
|
|
|
/* Make sure the duration has been recorded to some command. */
|
|
serverAssert(c->duration == 0);
|
|
#ifdef LOG_REQ_RES
|
|
reqresReset(c, 1);
|
|
#endif
|
|
|
|
if (c->deferred_reply_errors) listRelease(c->deferred_reply_errors);
|
|
c->deferred_reply_errors = NULL;
|
|
commitDeferredReplyBuffer(c, 1);
|
|
|
|
/* We clear the ASKING flag as well if we are not inside a MULTI, and
|
|
* if what we just executed is not the ASKING command itself. */
|
|
if (!c->flag.multi && prevcmd != askingCommand) c->flag.asking = 0;
|
|
|
|
/* We do the same for the CACHING command as well. It also affects
|
|
* the next command or transaction executed, in a way very similar
|
|
* to ASKING. */
|
|
if (!c->flag.multi && prevParentCmd != clientCommand) c->flag.tracking_caching = 0;
|
|
|
|
/* Remove the CLIENT_REPLY_SKIP flag if any so that the reply
|
|
* to the next command will be sent, but set the flag if the command
|
|
* we just processed was "CLIENT REPLY SKIP". */
|
|
c->flag.reply_skip = 0;
|
|
if (c->flag.reply_skip_next) {
|
|
c->flag.reply_skip = 1;
|
|
c->flag.reply_skip_next = 0;
|
|
}
|
|
}
|
|
|
|
/* Bring the I/O related state of the client back to its idle defaults:
 * byte counters, read/write I/O states, parsed command, pending command
 * flag and the I/O snapshot of the reply buffers. */
void resetClientIOState(client *c) {
    c->nwritten = 0;
    c->nread = 0;
    c->io_write_state = CLIENT_IDLE;
    c->io_read_state = CLIENT_IDLE;
    c->parsed_cmd = NULL;
    c->flag.pending_command = 0;
    c->io_last_bufpos = 0;
    c->io_last_reply_block = NULL;
}
|
|
|
|
/* Initializes the shared query buffer to a new sds with the default capacity.
|
|
* Need to ensure the initlen is not less than readlen in readToQueryBuf. */
|
|
void initSharedQueryBuf(void) {
|
|
thread_shared_qb = sdsnewlen(NULL, PROTO_IOBUF_LEN);
|
|
sdsclear(thread_shared_qb);
|
|
}
|
|
|
|
void freeSharedQueryBuf(void *dummy) {
|
|
UNUSED(dummy);
|
|
sdsfree(thread_shared_qb);
|
|
thread_shared_qb = NULL;
|
|
}
|
|
|
|
/* This function is used when we want to re-enter the event loop but there
|
|
* is the risk that the client we are dealing with will be freed in some
|
|
* way. This happens for instance in:
|
|
*
|
|
* * DEBUG RELOAD and similar.
|
|
* * When a Lua script is in -BUSY state.
|
|
* * A cluster replica executing CLUSTER SETSLOT during slot migration.
|
|
*
|
|
* So the function will protect the client by doing two things:
|
|
*
|
|
* 1) It removes the file events. This way it is not possible that an
|
|
* error is signaled on the socket, freeing the client.
|
|
* 2) Moreover it makes sure that if the client is freed in a different code
|
|
* path, it is not really released, but only marked for later release. */
|
|
void protectClient(client *c) {
|
|
c->flag.protected = 1;
|
|
if (c->conn) {
|
|
connSetReadHandler(c->conn, NULL);
|
|
connSetWriteHandler(c->conn, NULL);
|
|
}
|
|
}
|
|
|
|
/* This will undo the client protection done by protectClient() */
|
|
void unprotectClient(client *c) {
|
|
if (c->flag.protected) {
|
|
c->flag.protected = 0;
|
|
if (c->conn) {
|
|
connSetReadHandler(c->conn, readQueryFromClient);
|
|
if (clientHasPendingReplies(c)) putClientInPendingWriteQueue(c);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Like parseMultibulkBuffer(), but for the inline protocol instead of RESP,
|
|
* this function consumes the client query buffer and creates a command ready
|
|
* to be executed inside the client structure.
|
|
* Sets the client read_flags to indicate the parsing outcome. */
|
|
void parseInlineBuffer(client *c) {
|
|
char *newline;
|
|
int argc, j, linefeed_chars = 1;
|
|
sds *argv, aux;
|
|
size_t querylen;
|
|
int is_replicated = c->read_flags & READ_FLAGS_REPLICATED;
|
|
|
|
/* Search for end of line */
|
|
newline = strchr(c->querybuf + c->qb_pos, '\n');
|
|
|
|
/* Nothing to do without a \r\n */
|
|
if (newline == NULL) {
|
|
if (sdslen(c->querybuf) - c->qb_pos > PROTO_INLINE_MAX_SIZE) {
|
|
c->read_flags |= READ_FLAGS_ERROR_BIG_INLINE_REQUEST;
|
|
}
|
|
return;
|
|
}
|
|
|
|
/* Handle the \r\n case. */
|
|
if (newline != c->querybuf + c->qb_pos && *(newline - 1) == '\r') newline--, linefeed_chars++;
|
|
|
|
/* Split the input buffer up to the \r\n */
|
|
querylen = newline - (c->querybuf + c->qb_pos);
|
|
aux = sdsnewlen(c->querybuf + c->qb_pos, querylen);
|
|
argv = sdssplitargs(aux, &argc);
|
|
sdsfree(aux);
|
|
if (argv == NULL) {
|
|
c->read_flags |= READ_FLAGS_ERROR_UNBALANCED_QUOTES;
|
|
return;
|
|
}
|
|
|
|
if (querylen == 0) {
|
|
c->read_flags |= READ_FLAGS_INLINE_ZERO_QUERY_LEN;
|
|
}
|
|
|
|
/* Primaries should never send us inline protocol to run actual
|
|
* commands. If this happens, it is likely due to a bug in the server where
|
|
* we got some desynchronization in the protocol, for example
|
|
* because of a PSYNC gone bad.
|
|
*
|
|
* However there is an exception: primaries may send us just a newline
|
|
* to keep the connection active. */
|
|
if (querylen != 0 && is_replicated) {
|
|
sdsfreesplitres(argv, argc);
|
|
c->read_flags |= READ_FLAGS_ERROR_UNEXPECTED_INLINE_FROM_REPLICATED_CLIENT;
|
|
return;
|
|
}
|
|
|
|
/* Move querybuffer position to the next query in the buffer. */
|
|
c->qb_pos += querylen + linefeed_chars;
|
|
|
|
/* Setup argv array on client structure */
|
|
if (argc) {
|
|
if (c->argv) zfree(c->argv);
|
|
c->argv_len = argc;
|
|
c->argv = zmalloc(sizeof(robj *) * c->argv_len);
|
|
c->argv_len_sum = 0;
|
|
}
|
|
|
|
/* Create an Object for all arguments. */
|
|
for (c->argc = 0, j = 0; j < argc; j++) {
|
|
/* Strings returned from sdssplitargs() may have unused capacity that we can trim. */
|
|
argv[j] = sdsRemoveFreeSpace(argv[j], 1);
|
|
c->argv[c->argc] = createObject(OBJ_STRING, argv[j]);
|
|
c->argc++;
|
|
c->argv_len_sum += sdslen(argv[j]);
|
|
}
|
|
zfree(argv);
|
|
|
|
/* Per-slot network bytes-in calculation.
|
|
*
|
|
* We calculate and store the current command's ingress bytes under
|
|
* c->net_input_bytes_curr_cmd, for which its per-slot aggregation is deferred
|
|
* until c->slot is parsed later within processCommand().
|
|
*
|
|
* Calculation: For inline buffer, every whitespace is of length 1,
|
|
* with the exception of the trailing '\r\n' being length 2.
|
|
*
|
|
* For example;
|
|
* Command) SET key value
|
|
* Inline) SET key value\r\n
|
|
* */
|
|
c->net_input_bytes_curr_cmd = (c->argv_len_sum + (c->argc - 1) + 2);
|
|
c->read_flags |= READ_FLAGS_PARSING_COMPLETED;
|
|
c->reqtype = 0;
|
|
}
|
|
|
|
/* Helper function. Record protocol error details in server log,
|
|
* and set the client as CLIENT_CLOSE_AFTER_REPLY and
|
|
* CLIENT_PROTOCOL_ERROR. */
|
|
#define PROTO_DUMP_LEN 128
|
|
static void setProtocolError(const char *errstr, client *c) {
|
|
if (server.verbosity <= LL_VERBOSE || isReplicatedClient(c)) {
|
|
sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log);
|
|
|
|
/* Sample some protocol to given an idea about what was inside. */
|
|
char buf[256];
|
|
buf[0] = '\0';
|
|
if (server.hide_user_data_from_log) {
|
|
snprintf(buf, sizeof(buf), "*redacted*");
|
|
} else {
|
|
if (c->querybuf && sdslen(c->querybuf) - c->qb_pos < PROTO_DUMP_LEN) {
|
|
snprintf(buf, sizeof(buf), "'%s'", c->querybuf + c->qb_pos);
|
|
} else if (c->querybuf) {
|
|
snprintf(buf, sizeof(buf), "'%.*s' (... more %zu bytes ...) '%.*s'",
|
|
PROTO_DUMP_LEN / 2, c->querybuf + c->qb_pos, sdslen(c->querybuf) - c->qb_pos - PROTO_DUMP_LEN,
|
|
PROTO_DUMP_LEN / 2, c->querybuf + sdslen(c->querybuf) - PROTO_DUMP_LEN / 2);
|
|
}
|
|
|
|
/* Remove non printable chars. */
|
|
char *p = buf;
|
|
while (*p != '\0') {
|
|
if (!isprint(*p)) *p = '.';
|
|
p++;
|
|
}
|
|
}
|
|
/* Log all the client and protocol info. */
|
|
int loglevel = (isReplicatedClient(c)) ? LL_WARNING : LL_VERBOSE;
|
|
serverLog(loglevel, "Protocol error (%s) from client: %s. Query buffer: %s", errstr, client, buf);
|
|
sdsfree(client);
|
|
}
|
|
c->flag.close_after_reply = 1;
|
|
c->flag.protocol_error = 1;
|
|
}
|
|
|
|
/* Process the query buffer for client 'c', setting up the client argument
|
|
* vector for command execution and parses additional commands into a queue.
|
|
* Sets the client's read_flags to indicate the parsing outcome.
|
|
*
|
|
* This function is called if processInputBuffer() detects that the next
|
|
* command is in RESP format, so the first byte in the command is found
|
|
* to be '*'. Otherwise for inline commands parseInlineBuffer() is called. */
|
|
void parseMultibulkBuffer(client *c) {
|
|
int flag = parseMultibulk(c, &c->argc, &c->argv, &c->argv_len,
|
|
&c->argv_len_sum, &c->net_input_bytes_curr_cmd);
|
|
c->read_flags |= flag;
|
|
|
|
if (c->read_flags & READ_FLAGS_AUTH_REQUIRED) {
|
|
/* Execute client's AUTH command before parsing more, because it affects
|
|
* parser limits for max allowed bulk and multibulk lengths. */
|
|
return;
|
|
}
|
|
|
|
if (isReplicatedClient(c)) {
|
|
/* TODO: some change is required for replication offset which is
|
|
* computed from c->qb_pos, assuming we only parse one command at a
|
|
* time. Disable multi-command parsing for replication for now. */
|
|
return;
|
|
}
|
|
|
|
/* Try parsing pipelined commands. */
|
|
cmdQueue *queue = &c->cmd_queue;
|
|
serverAssert(queue->len == 0);
|
|
while ((flag & READ_FLAGS_PARSING_COMPLETED) &&
|
|
sdslen(c->querybuf) > c->qb_pos &&
|
|
c->querybuf[c->qb_pos] == '*') {
|
|
c->reqtype = PROTO_REQ_MULTIBULK;
|
|
/* Push a new parser state to the command queue */
|
|
if (queue->len == queue->cap) {
|
|
if (queue->cap == 0) {
|
|
queue->cap = COMMAND_QUEUE_MIN_CAPACITY;
|
|
} else if (queue->cap <= 512) {
|
|
queue->cap *= 2;
|
|
} else {
|
|
break; /* Limit the length of the command queue. */
|
|
}
|
|
queue->cmds = zrealloc(queue->cmds, queue->cap * sizeof(parsedCommand));
|
|
}
|
|
parsedCommand *p = &queue->cmds[queue->len++];
|
|
memset(p, 0, sizeof(*p));
|
|
flag = parseMultibulk(c, &p->argc, &p->argv, &p->argv_len,
|
|
&p->argv_len_sum, &p->input_bytes);
|
|
p->read_flags = flag;
|
|
p->slot = -1;
|
|
}
|
|
}
|
|
|
|
/* Incremental parsing of a command in the client's query buffer.
|
|
*
|
|
* Parser state related to the input buffer are per client and stored in the
|
|
* client struct: querybuf, qb_len, multibulklen, bulklen, querybuf_peak.
|
|
*
|
|
* Parser state for the command structures is supplied using pointer arguments,
|
|
* which are also used for returning the parsed command or error: argv,
|
|
* argv_len, argc, read_flag.
|
|
*
|
|
* Returns a non-zero if parsing is complete (either error or success) and zero
|
|
* if the input buffer doesn't contain a enough data to parse a complete
|
|
* command. If non-zero is returned, the returned value is a read flag, either
|
|
* READ_FLAGS_PARSING_COMPLETED on success or one of the READ_FLAGS_ERROR_(...)
|
|
* values on parse error. */
|
|
static int parseMultibulk(client *c,
|
|
int *argc,
|
|
robj ***argv,
|
|
int *argv_len,
|
|
size_t *argv_len_sum,
|
|
unsigned long long *net_input_bytes_curr_cmd) {
|
|
char *newline = NULL;
|
|
int ok;
|
|
long long ll;
|
|
int is_replicated = c->read_flags & READ_FLAGS_REPLICATED;
|
|
int auth_required = c->read_flags & READ_FLAGS_AUTH_REQUIRED;
|
|
|
|
if (c->multibulklen == 0) {
|
|
/* The client (argc) should have been reset */
|
|
serverAssertWithInfo(c, NULL, *argc == 0);
|
|
|
|
/* Multi bulk length cannot be read without a \r\n */
|
|
newline = memchr(c->querybuf + c->qb_pos, '\r', sdslen(c->querybuf) - c->qb_pos);
|
|
if (newline == NULL) {
|
|
if (sdslen(c->querybuf) - c->qb_pos > PROTO_INLINE_MAX_SIZE) {
|
|
return READ_FLAGS_ERROR_BIG_MULTIBULK;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* Buffer should also contain \n */
|
|
if (newline - (c->querybuf + c->qb_pos) > (ssize_t)(sdslen(c->querybuf) - c->qb_pos - 2)) return 0;
|
|
|
|
/* Check that what follows \r is a real \n */
|
|
if (unlikely(newline[1] != '\n')) {
|
|
return READ_FLAGS_ERROR_INVALID_CRLF;
|
|
}
|
|
|
|
/* We know for sure there is a whole line since newline != NULL,
|
|
* so go ahead and find out the multi bulk length. */
|
|
serverAssertWithInfo(c, NULL, c->querybuf[c->qb_pos] == '*');
|
|
size_t multibulklen_slen = newline - (c->querybuf + 1 + c->qb_pos);
|
|
ok = string2ll(c->querybuf + 1 + c->qb_pos, multibulklen_slen, &ll);
|
|
if (!ok || ll > INT_MAX) {
|
|
return READ_FLAGS_ERROR_INVALID_MULTIBULK_LEN;
|
|
} else if (ll > 10 && auth_required) {
|
|
return READ_FLAGS_ERROR_UNAUTHENTICATED_MULTIBULK_LEN;
|
|
}
|
|
|
|
c->qb_pos = (newline - c->querybuf) + 2;
|
|
|
|
if (ll <= 0) {
|
|
return READ_FLAGS_PARSING_NEGATIVE_MBULK_LEN;
|
|
}
|
|
|
|
c->multibulklen = ll;
|
|
c->bulklen = -1;
|
|
|
|
/* Setup argv array */
|
|
if (*argv) zfree(*argv);
|
|
*argv_len = min(c->multibulklen, 1024);
|
|
*argv = zmalloc(sizeof(robj *) * *argv_len);
|
|
*argv_len_sum = 0;
|
|
|
|
/* Per-slot network bytes-in calculation.
|
|
*
|
|
* We calculate and store the current command's ingress bytes under
|
|
* c->net_input_bytes_curr_cmd, for which its per-slot aggregation is deferred
|
|
* until c->slot is parsed later within processCommand().
|
|
*
|
|
* Calculation: For multi bulk buffer, we accumulate four factors, namely;
|
|
*
|
|
* 1) multibulklen_slen + 3
|
|
* Cumulative string length (and not the value of) of multibulklen,
|
|
* including the first "*" byte and last "\r\n" 2 bytes from RESP.
|
|
* 2) bulklen_slen + 3
|
|
* Cumulative string length (and not the value of) of bulklen,
|
|
* including +3 from RESP first "$" byte and last "\r\n" 2 bytes per argument count.
|
|
* 3) c->argv_len_sum
|
|
* Cumulative string length of all argument vectors.
|
|
* 4) c->argc * 2
|
|
* Cumulative string length of the arguments' white-spaces, for which there exists a total of
|
|
* "\r\n" 2 bytes per argument.
|
|
*
|
|
* For example;
|
|
* Command) SET key value
|
|
* RESP) *3\r\n$3\r\nSET\r\n$3\r\nkey\r\n$5\r\nvalue\r\n
|
|
*
|
|
* 1) String length of "*3\r\n" is 4, obtained from (multibulklen_slen + 3).
|
|
* 2) String length of "$3\r\n" "$3\r\n" "$5\r\n" is 12, obtained from (bulklen_slen + 3).
|
|
* 3) String length of "SET" "key" "value" is 11, obtained from (c->argv_len_sum).
|
|
* 4) String length of the 3 arguments' white-spaces "\r\n" is 6, obtained from (c->argc * 2).
|
|
*
|
|
* The 1st component is calculated within the below line.
|
|
* */
|
|
*net_input_bytes_curr_cmd += (multibulklen_slen + 3);
|
|
}
|
|
|
|
serverAssertWithInfo(c, NULL, c->multibulklen > 0);
|
|
while (c->multibulklen) {
|
|
/* Read bulk length if unknown */
|
|
if (c->bulklen == -1) {
|
|
newline = memchr(c->querybuf + c->qb_pos, '\r', sdslen(c->querybuf) - c->qb_pos);
|
|
if (newline == NULL) {
|
|
if (sdslen(c->querybuf) - c->qb_pos > PROTO_INLINE_MAX_SIZE) {
|
|
return READ_FLAGS_ERROR_BIG_BULK_COUNT;
|
|
}
|
|
break;
|
|
}
|
|
|
|
/* Buffer should also contain \n */
|
|
if (newline - (c->querybuf + c->qb_pos) > (ssize_t)(sdslen(c->querybuf) - c->qb_pos - 2)) return 0;
|
|
|
|
if (c->querybuf[c->qb_pos] != '$') {
|
|
return READ_FLAGS_ERROR_MBULK_UNEXPECTED_CHARACTER;
|
|
}
|
|
|
|
/* Check that what follows \r is a real \n */
|
|
if (unlikely(newline[1] != '\n')) {
|
|
return READ_FLAGS_ERROR_INVALID_CRLF;
|
|
}
|
|
|
|
size_t bulklen_slen = newline - (c->querybuf + c->qb_pos + 1);
|
|
ok = string2ll(c->querybuf + c->qb_pos + 1, bulklen_slen, &ll);
|
|
if (!ok || ll < 0 || (!(is_replicated) && ll > server.proto_max_bulk_len)) {
|
|
return READ_FLAGS_ERROR_MBULK_INVALID_BULK_LEN;
|
|
} else if (ll > 16384 && auth_required) {
|
|
return READ_FLAGS_ERROR_UNAUTHENTICATED_BULK_LEN;
|
|
}
|
|
|
|
c->qb_pos = newline - c->querybuf + 2;
|
|
if (!(is_replicated) && ll >= PROTO_MBULK_BIG_ARG) {
|
|
/* When the client is not a replicated client (because replicated
|
|
* client's querybuf can only be trimmed after data applied
|
|
* and sent to replicas).
|
|
*
|
|
* If we are going to read a large object from network
|
|
* try to make it likely that it will start at c->querybuf
|
|
* boundary so that we can optimize object creation
|
|
* avoiding a large copy of data.
|
|
*
|
|
* But only when the data we have not parsed is less than
|
|
* or equal to ll+2. If the data length is greater than
|
|
* ll+2, trimming querybuf is just a waste of time, because
|
|
* at this time the querybuf contains not only our bulk. */
|
|
if (sdslen(c->querybuf) - c->qb_pos <= (size_t)ll + 2) {
|
|
if (c->querybuf == thread_shared_qb) {
|
|
/* Let the client take the ownership of the shared buffer. */
|
|
initSharedQueryBuf();
|
|
}
|
|
sdsrange(c->querybuf, c->qb_pos, -1);
|
|
c->qb_pos = 0;
|
|
/* Hint the sds library about the amount of bytes this string is
|
|
* going to contain. */
|
|
c->querybuf = sdsMakeRoomForNonGreedy(c->querybuf, ll + 2 - sdslen(c->querybuf));
|
|
/* We later set the peak to the used portion of the buffer, but here we over
|
|
* allocated because we know what we need, make sure it'll not be shrunk before used. */
|
|
if (c->querybuf_peak < (size_t)ll + 2) c->querybuf_peak = ll + 2;
|
|
}
|
|
}
|
|
c->bulklen = ll;
|
|
/* Per-slot network bytes-in calculation, 2nd component. */
|
|
*net_input_bytes_curr_cmd += (bulklen_slen + 3);
|
|
}
|
|
|
|
/* Read bulk argument */
|
|
if (sdslen(c->querybuf) - c->qb_pos < (size_t)(c->bulklen + 2)) {
|
|
/* Not enough data (+2 == trailing \r\n) */
|
|
break;
|
|
} else {
|
|
/* Check if we have space in argv, grow if needed */
|
|
if (*argc >= *argv_len) {
|
|
*argv_len = min(*argv_len < INT_MAX / 2 ? (*argv_len) * 2 : INT_MAX,
|
|
*argc + c->multibulklen);
|
|
*argv = zrealloc(*argv, sizeof(robj *) * (*argv_len));
|
|
}
|
|
|
|
/* Check that what follows argv is a real \r\n */
|
|
if (unlikely(c->querybuf[c->qb_pos + c->bulklen] != '\r' ||
|
|
c->querybuf[c->qb_pos + c->bulklen + 1] != '\n')) {
|
|
return READ_FLAGS_ERROR_INVALID_CRLF;
|
|
}
|
|
|
|
/* Optimization: if a non-replicated client's buffer contains JUST our bulk element
|
|
* instead of creating a new object by *copying* the sds we
|
|
* just use the current sds string. */
|
|
if (!is_replicated && c->qb_pos == 0 && c->bulklen >= PROTO_MBULK_BIG_ARG &&
|
|
sdslen(c->querybuf) == (size_t)(c->bulklen + 2)) {
|
|
(*argv)[(*argc)++] = createObject(OBJ_STRING, c->querybuf);
|
|
*argv_len_sum += c->bulklen;
|
|
sdsIncrLen(c->querybuf, -2); /* remove CRLF */
|
|
/* Assume that if we saw a fat argument we'll see another one
|
|
* likely... */
|
|
c->querybuf = sdsnewlen(SDS_NOINIT, c->bulklen + 2);
|
|
sdsclear(c->querybuf);
|
|
} else {
|
|
(*argv)[(*argc)++] = createStringObject(c->querybuf + c->qb_pos, c->bulklen);
|
|
*argv_len_sum += c->bulklen;
|
|
c->qb_pos += c->bulklen + 2;
|
|
}
|
|
c->bulklen = -1;
|
|
c->multibulklen--;
|
|
}
|
|
}
|
|
|
|
/* We're done when c->multibulklen == 0 */
|
|
if (c->multibulklen == 0) {
|
|
/* Per-slot network bytes-in calculation, 3rd and 4th components. */
|
|
*net_input_bytes_curr_cmd += (*argv_len_sum + (*argc * 2));
|
|
c->reqtype = 0;
|
|
return READ_FLAGS_PARSING_COMPLETED;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* Perform necessary tasks after a command was executed:
|
|
*
|
|
* 1. The client is reset unless there are reasons to avoid doing it.
|
|
* 2. In the case of primary clients, the replication offset is updated.
|
|
* 3. Propagate commands we got from our primary to replicas down the line. */
|
|
void commandProcessed(client *c) {
|
|
/* If client is blocked(including paused), just return avoid reset and replicate.
|
|
*
|
|
* 1. Don't reset the client structure for blocked clients, so that the reply
|
|
* callback will still be able to access the client argv and argc fields.
|
|
* The client will be reset in unblockClient().
|
|
* 2. Don't update replication offset or propagate commands to replicas,
|
|
* since we have not applied the command. */
|
|
if (c->flag.blocked) return;
|
|
|
|
reqresAppendResponse(c);
|
|
clusterSlotStatsAddNetworkBytesInForUserClient(c);
|
|
resetClient(c);
|
|
|
|
if (!c->repl_data) return;
|
|
|
|
long long prev_offset = c->repl_data->reploff;
|
|
if (isReplicatedClient(c) && !c->flag.multi) {
|
|
/* Update the applied replication offset of our primary. */
|
|
c->repl_data->reploff = c->repl_data->read_reploff - sdslen(c->querybuf) + c->qb_pos;
|
|
}
|
|
|
|
/* If the client is replicated we need to compute the difference
|
|
* between the applied offset before and after processing the buffer,
|
|
* to understand how much of the replication stream was actually
|
|
* applied to the state: this quantity, and its corresponding
|
|
* part of the replication stream, will be propagated to the
|
|
* sub-replicas and to the replication backlog. */
|
|
if (isReplicatedClient(c)) {
|
|
long long applied = c->repl_data->reploff - prev_offset;
|
|
if (applied) {
|
|
replicationFeedStreamFromPrimaryStream(c->querybuf + c->repl_data->repl_applied, applied);
|
|
c->repl_data->repl_applied += applied;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* This function calls processCommand(), but also performs a few sub tasks
|
|
* for the client that are useful in that context:
|
|
*
|
|
* 1. It sets the current client to the client 'c'.
|
|
* 2. calls commandProcessed() if the command was handled.
|
|
*
|
|
* The function returns C_ERR in case the client was freed as a side effect
|
|
* of processing the command, otherwise C_OK is returned. */
|
|
int processCommandAndResetClient(client *c) {
|
|
int deadclient = 0;
|
|
client *old_client = server.current_client;
|
|
server.current_client = c;
|
|
if (processCommand(c) == C_OK) {
|
|
commandProcessed(c);
|
|
/* Update the client's memory to include output buffer growth following the
|
|
* processed command. */
|
|
if (c->conn) updateClientMemUsageAndBucket(c);
|
|
}
|
|
|
|
if (server.current_client == NULL) deadclient = 1;
|
|
/*
|
|
* Restore the old client, this is needed because when a script
|
|
* times out, we will get into this code from processEventsWhileBlocked.
|
|
* Which will cause to set the server.current_client. If not restored
|
|
* we will return 1 to our caller which will falsely indicate the client
|
|
* is dead and will stop reading from its buffer.
|
|
*/
|
|
server.current_client = old_client;
|
|
/* performEvictions may flush replica output buffers. This may
|
|
* result in a replica, that may be the active client, to be
|
|
* freed. */
|
|
return deadclient ? C_ERR : C_OK;
|
|
}
|
|
|
|
|
|
/* This function will execute any fully parsed commands pending on
|
|
* the client. Returns C_ERR if the client is no longer valid after executing
|
|
* the command, and C_OK for all other cases. */
|
|
int processPendingCommandAndInputBuffer(client *c) {
|
|
/* Notice, this code is also called from 'processUnblockedClients'.
|
|
* But in case of a module blocked client (see RM_Call 'K' flag) we do not reach this code path.
|
|
* So whenever we change the code here we need to consider if we need this change on module
|
|
* blocked client as well */
|
|
if (c->flag.pending_command) {
|
|
c->flag.pending_command = 0;
|
|
if (processCommandAndResetClient(c) == C_ERR) {
|
|
return C_ERR;
|
|
}
|
|
}
|
|
|
|
/* Now process client if it has more commands queued and/or more data in
|
|
* it's buffer.
|
|
*
|
|
* Note: when a primary client steps into this function,
|
|
* it can always satisfy this condition, because its querybuf
|
|
* contains data not applied. */
|
|
if ((c->querybuf && sdslen(c->querybuf) > 0) || c->cmd_queue.off < c->cmd_queue.len) {
|
|
return processInputBuffer(c);
|
|
}
|
|
return C_OK;
|
|
}
|
|
|
|
/* Parse one or more commands from the query buf.
|
|
*
|
|
* This function may be called from the main thread or from the I/O thread.
|
|
*
|
|
* Sets the client's read_flags to indicate the parsing outcome. If multiple
|
|
* commands could be parsed, additional parsed commands are stored in the
|
|
* client's command queue. */
|
|
void parseInputBuffer(client *c) {
|
|
/* The command queue must be emptied before parsing. */
|
|
serverAssert(c->cmd_queue.len == 0);
|
|
|
|
/* Determine request type when unknown. */
|
|
if (!c->reqtype) {
|
|
if (c->querybuf[c->qb_pos] == '*') {
|
|
c->reqtype = PROTO_REQ_MULTIBULK;
|
|
} else {
|
|
c->reqtype = PROTO_REQ_INLINE;
|
|
}
|
|
}
|
|
|
|
if (c->reqtype == PROTO_REQ_INLINE) {
|
|
parseInlineBuffer(c);
|
|
} else if (c->reqtype == PROTO_REQ_MULTIBULK) {
|
|
parseMultibulkBuffer(c);
|
|
} else {
|
|
serverPanic("Unknown request type");
|
|
}
|
|
}
|
|
|
|
/* Free unused memory in a client's queue of parsed commands. */
|
|
void trimCommandQueue(client *c) {
|
|
if (c->flag.close_asap) return; /* Prevent concurrent access with
|
|
freeClientAsync(). */
|
|
cmdQueue *queue = &c->cmd_queue;
|
|
if (queue->cmds != NULL) {
|
|
if (queue->len == 0) {
|
|
zfree(queue->cmds);
|
|
queue->cmds = NULL;
|
|
queue->cap = 0;
|
|
} else {
|
|
/* Try shrink to the next power of two >= len */
|
|
const int bits = CHAR_BIT * sizeof(unsigned int);
|
|
uint16_t cap = queue->len == 1 ? 1 : 1 << (bits - __builtin_clz(queue->len - 1));
|
|
serverAssert(cap >= queue->len);
|
|
cap = max(cap, COMMAND_QUEUE_MIN_CAPACITY);
|
|
if (cap < queue->cap) {
|
|
queue->cap = cap;
|
|
queue->cmds = zrealloc(queue->cmds, cap * sizeof(parsedCommand));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Tells whether a new command may be parsed for client 'c' right now.
 * Returns 1 if so, 0 if the client must be left alone for the moment. */
int canParseCommand(client *c) {
    /* A command is already attached to the client. */
    if (c->cmd != NULL) return 0;

    /* The client is in the middle of something. */
    if (c->flag.blocked) return 0;
    if (c->flag.unblocked) return 0;

    /* A command is already waiting in c->argv to be executed: do not process
     * more buffers for this client. */
    if (c->flag.pending_command) return 0;

    /* While a busy script condition exists on this node, input coming from
     * replicated clients is not processed: the replication stream is simply
     * accumulated (rather than answered with -BUSY as for other clients) and
     * processing resumes later. */
    if (isInsideYieldingLongCommand() && isReplicatedClient(c)) return 0;

    /* CLIENT_CLOSE_AFTER_REPLY closes the connection as soon as the reply is
     * written to the client, so the reply must not be allowed to grow once
     * the flag is set (i.e. no more commands are processed). */
    if (c->flag.close_after_reply) return 0;

    /* The same goes for clients we want to terminate ASAP. */
    if (c->flag.close_asap) return 0;

    return 1;
}
|
|
|
|
/* Pops a command from the command queue and sets it as the client's current
|
|
* command. Returns true on success and false if the queue was empty. */
|
|
static bool consumeCommandQueue(client *c) {
|
|
cmdQueue *queue = &c->cmd_queue;
|
|
if (queue->off >= queue->len) return false;
|
|
parsedCommand *p = &queue->cmds[queue->off++];
|
|
/* Combine the command's read flags with the client's read flags. Some read
|
|
* flags describe the client state (AUTH_REQUIRED) while others describe the
|
|
* command parsing outcome (PARSING_COMPLETED). */
|
|
c->read_flags |= p->read_flags;
|
|
c->argc = p->argc;
|
|
c->argv = p->argv;
|
|
c->argv_len = p->argv_len;
|
|
c->argv_len_sum = p->argv_len_sum;
|
|
c->net_input_bytes_curr_cmd = p->input_bytes;
|
|
c->parsed_cmd = p->cmd;
|
|
c->slot = p->slot;
|
|
if (queue->off == queue->len) {
|
|
/* The queue is empty. Don't free it here, because if parsing is done in
|
|
* I/O threads, we want to free it in I/O threads too, to avoid
|
|
* fragmentation. */
|
|
queue->off = queue->len = 0;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
/* Throws away every command still waiting in the client's command queue:
 * the arguments of each unconsumed command are released, the queue array is
 * freed and the queue is left empty with zero capacity. */
void discardCommandQueue(client *c) {
    cmdQueue *q = &c->cmd_queue;

    for (; q->off < q->len; q->off++) {
        parsedCommand *pending = &q->cmds[q->off];
        int i = 0;
        while (i < pending->argc) {
            decrRefCount(pending->argv[i]);
            i++;
        }
        zfree(pending->argv);
    }

    zfree(q->cmds);
    q->cmds = NULL;
    q->cap = 0;
    q->len = 0;
    q->off = 0;
}
|
|
|
|
/* Returns the number of keys in the the incr_states array after adding keys. */
|
|
static int addKeysToIncrFindBatch(client *c,
|
|
struct serverCommand *cmd,
|
|
robj **argv,
|
|
int argc,
|
|
hashtableIncrementalFindState *incr_states,
|
|
int num,
|
|
int max) {
|
|
getKeysResult result;
|
|
initGetKeysResult(&result);
|
|
int numkeys = getKeysFromCommand(cmd, argv, argc, &result);
|
|
if (numkeys) {
|
|
int kvstore_idx = 0;
|
|
if (server.cluster_enabled) {
|
|
robj *first_key = argv[result.keys[0].pos];
|
|
kvstore_idx = keyHashSlot(first_key->ptr, sdslen(first_key->ptr));
|
|
}
|
|
hashtable *ht = kvstoreGetHashtable(c->db->keys, kvstore_idx);
|
|
if (ht != NULL) {
|
|
for (int i = 0; i < numkeys && num < max; i++) {
|
|
hashtableIncrementalFindState *incr_state = &incr_states[num++];
|
|
robj *keyobj = argv[result.keys[i].pos];
|
|
hashtableIncrementalFindInit(incr_state, ht, keyobj->ptr);
|
|
}
|
|
}
|
|
}
|
|
getKeysFreeResult(&result);
|
|
return num;
|
|
}
|
|
|
|
/* Prefetches the keys for the commands queued up in the client.
|
|
*
|
|
* TODO: Avoid the logic duplicated with the code in memory_prefetch.c which
|
|
* is used with I/O threading. */
|
|
static void prefetchCommandQueueKeys(client *c) {
|
|
if (c->read_flags & READ_FLAGS_PREFETCHED) return;
|
|
c->read_flags |= READ_FLAGS_PREFETCHED;
|
|
|
|
/* Prefetching states */
|
|
const int max_keys = server.prefetch_batch_max_size;
|
|
int num_keys = 0;
|
|
hashtableIncrementalFindState key_incr_states[max_keys];
|
|
if (max_keys <= 1) return; /* No point to prefetch a single key */
|
|
|
|
/* If the command is valid, add keys to incremental find batch. */
|
|
if (c->parsed_cmd != NULL && !(c->read_flags & READ_FLAGS_BAD_ARITY)) {
|
|
num_keys = addKeysToIncrFindBatch(c, c->parsed_cmd, c->argv, c->argc,
|
|
key_incr_states, num_keys, max_keys);
|
|
} else {
|
|
/* Command is already found to be incomplete, non-existing, etc. */
|
|
debugServerAssert(!(c->read_flags & READ_FLAGS_PARSING_COMPLETED) ||
|
|
c->argc == 0 ||
|
|
(c->read_flags & READ_FLAGS_COMMAND_NOT_FOUND) ||
|
|
(c->read_flags & READ_FLAGS_BAD_ARITY));
|
|
}
|
|
|
|
cmdQueue *queue = &c->cmd_queue;
|
|
for (int i = queue->off; i < queue->len; i++) {
|
|
if (num_keys >= max_keys) break;
|
|
parsedCommand *p = &queue->cmds[i];
|
|
p->read_flags |= READ_FLAGS_PREFETCHED;
|
|
if (p->cmd == NULL || p->read_flags & READ_FLAGS_BAD_ARITY) {
|
|
/* Command is already found to be incomplete, non-existing, etc. */
|
|
debugServerAssert(!(p->read_flags & READ_FLAGS_PARSING_COMPLETED) ||
|
|
p->argc == 0 ||
|
|
(p->read_flags & READ_FLAGS_COMMAND_NOT_FOUND) ||
|
|
(p->read_flags & READ_FLAGS_BAD_ARITY));
|
|
continue;
|
|
}
|
|
num_keys = addKeysToIncrFindBatch(c, p->cmd, p->argv, p->argc,
|
|
key_incr_states, num_keys, max_keys);
|
|
}
|
|
if (num_keys <= 1) return; /* No point to prefetch a single key */
|
|
|
|
/* Batch-lookup the keys. */
|
|
int not_complete_count;
|
|
do {
|
|
not_complete_count = 0;
|
|
for (int i = 0; i < num_keys; i++) {
|
|
not_complete_count += hashtableIncrementalFindStep(&key_incr_states[i]);
|
|
}
|
|
} while (not_complete_count != 0);
|
|
|
|
/* Prefetch value pointers. */
|
|
for (int i = 0; i < num_keys; i++) {
|
|
void *entry;
|
|
if (hashtableIncrementalFindGetResult(&key_incr_states[i], &entry)) {
|
|
robj *val = entry;
|
|
/* TODO? Prefetch all types and encodings except OBJ_ENCODING_EMBSTR
|
|
* and OBJ_ENCODING_INT. */
|
|
if (val->encoding == OBJ_ENCODING_RAW && val->type == OBJ_STRING) {
|
|
valkey_prefetch(val->ptr);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Parses the client's query buffer and/or executes commands that were already
 * parsed into its command queue, for as long as there is unparsed data or a
 * queued command and the client is allowed to parse (see canParseCommand()).
 *
 * Returns C_ERR if the client is no longer valid after executing a command,
 * C_OK otherwise. */
int processInputBuffer(client *c) {
    for (;;) {
        int has_unparsed_data = c->querybuf && c->qb_pos < sdslen(c->querybuf);
        int has_queued_cmds = c->cmd_queue.off < c->cmd_queue.len;
        if (!has_unparsed_data && !has_queued_cmds) break;

        if (!canParseCommand(c)) break;

        /* Start from read flags that only describe the client state. */
        int replicated = isReplicatedClient(c);
        c->read_flags = replicated ? READ_FLAGS_REPLICATED : 0;
        if (authRequired(c)) c->read_flags |= READ_FLAGS_AUTH_REQUIRED;

        /* Commands already queued up are served first; the query buffer is
         * parsed only when the queue has nothing to offer. */
        if (!consumeCommandQueue(c)) {
            parseInputBuffer(c);
            prepareCommandQueue(c);
        }

        /* Prefetch keys for the next commands in queue, if not already done. */
        prefetchCommandQueueKeys(c);

        if (handleParseResults(c) != PARSE_OK) break;

        /* No command to process - continue parsing the query buf. */
        if (c->argc == 0) continue;

        /* Put the shared query buffer back to its default state before the
         * command runs. This avoids modifying the shared qb by accident during
         * processCommand, as it may be used for other clients during
         * processEventsWhileBlocked. */
        if (c->querybuf == thread_shared_qb) resetSharedQueryBuf(c);

        /* We are finally ready to execute the command. If the client is no
         * longer valid afterwards, return right away instead of leaving the
         * loop and trimming the client buffer later. */
        if (processCommandAndResetClient(c) == C_ERR) return C_ERR;
    }

    return C_OK;
}
|
|
|
|
/* This function can be called from the main-thread or from the IO-thread.
|
|
* The function allocates query-buf for the client if required and reads to it from the network.
|
|
* It will set c->nread to the bytes read from the network.
|
|
* Returns true if the buffer was filled (more data may be available). */
|
|
static bool readToQueryBuf(client *c) {
    /* Never read from a client that is already marked to be closed ASAP. */
    if (c->flag.close_asap) return false;

    const bool replicated = (c->read_flags & READ_FLAGS_REPLICATED) != 0;
    bool big_arg = false;
    size_t want = PROTO_IOBUF_LEN;                                  /* bytes requested from the socket */
    size_t used = (c->querybuf != NULL) ? sdslen(c->querybuf) : 0; /* bytes already buffered */

    /* In the middle of a multi bulk request whose current bulk argument is
     * large, size the read so that the query buffer ends up holding exactly
     * that argument (payload plus the two trailing bytes). This costs extra
     * read(2) calls but lets parseMultibulkBuffer() build the argument
     * object without copying. */
    if (c->reqtype == PROTO_REQ_MULTIBULK && c->multibulklen && c->bulklen != -1 &&
        c->bulklen >= PROTO_MBULK_BIG_ARG) {
        ssize_t remaining = (size_t)(c->bulklen + 2) - (used - c->qb_pos);
        big_arg = true;

        /* 'remaining' can be zero in edge cases, e.g. when a blocked client
         * is resumed after CLIENT PAUSE; keep the default size then. */
        if (remaining > 0) want = remaining;

        /* A replicated client still needs at least a full I/O buffer when it
         * meets a big argument (see #9100); it has no need to stop at the
         * argument boundary, so reading more is fine. */
        if (isReplicatedClient(c) && want < PROTO_IOBUF_LEN) want = PROTO_IOBUF_LEN;
    }

    /* No private buffer yet: borrow the per-thread shared buffer, unless a
     * big argument is expected, in which case start a dedicated one. */
    if (c->querybuf == NULL) {
        serverAssert(sdslen(thread_shared_qb) == 0);
        c->querybuf = big_arg ? sdsempty() : thread_shared_qb;
        used = sdslen(c->querybuf);
    }

    /* Growing c->querybuf could reallocate it, which would release the old
     * thread_shared_qb. That is not expected to happen for the shared buffer;
     * remember whether it is in use so this can be asserted after growing. */
    const bool on_shared_qb = (c->querybuf == thread_shared_qb);

    if (!replicated && /* replicated clients' querybuf can grow greedy. */
        (big_arg || sdsalloc(c->querybuf) < PROTO_IOBUF_LEN)) {
        /* For a big argument nothing beyond that argument will be read, and
         * for an initial allocation greedy growth would collide with the
         * RESIZE_THRESHOLD mechanism: allocate exactly what is needed. */
        c->querybuf = sdsMakeRoomForNonGreedy(c->querybuf, want);

        /* The peak is later set to the used portion of the buffer; raise it
         * now to the planned size so the buffer is not shrunk before use. */
        if (c->querybuf_peak < used + want) c->querybuf_peak = used + want;
    } else {
        c->querybuf = sdsMakeRoomFor(c->querybuf, want);

        /* Read as much as the buffer can hold to save read(2) calls. */
        want = sdsavail(c->querybuf);
    }
    if (on_shared_qb) serverAssert(c->querybuf == thread_shared_qb);

    c->nread = connRead(c->conn, c->querybuf + used, want);
    if (c->nread <= 0) return false;

    sdsIncrLen(c->querybuf, c->nread);
    used = sdslen(c->querybuf);
    if (c->querybuf_peak < used) c->querybuf_peak = used;

    if (!replicated) {
        /* Commands queued by MULTI have not been executed yet, so their
         * arguments count towards the query buffer in a broader sense.
         *
         * Clients that still have to authenticate are limited to 1MB. */
        size_t qb_memory = used + (c->mstate ? c->mstate->argv_len_sums : 0);
        const bool over_limit = qb_memory > server.client_max_querybuf_len;
        const bool over_unauth_limit =
            qb_memory > 1024 * 1024 && (c->read_flags & READ_FLAGS_AUTH_REQUIRED);
        if (over_limit || over_unauth_limit) {
            c->read_flags |= READ_FLAGS_QB_LIMIT_REACHED;
        }
    }

    /* A read that returned exactly what was asked for means the socket may
     * still hold more data. */
    return (size_t)c->nread == want;
}
|
|
|
|
#define REPL_MAX_READS_PER_IO_EVENT 25
|
|
void readQueryFromClient(connection *conn) {
|
|
client *c = connGetPrivateData(conn);
|
|
/* Check if we can send the client to be handled by the IO-thread */
|
|
if (postponeClientRead(c)) return;
|
|
|
|
if (c->io_write_state != CLIENT_IDLE || c->io_read_state != CLIENT_IDLE) return;
|
|
|
|
bool repeat = false;
|
|
int iter = 0;
|
|
do {
|
|
bool full_read = readToQueryBuf(c);
|
|
if (handleReadResult(c) == C_OK) {
|
|
if (processInputBuffer(c) == C_ERR) return;
|
|
trimCommandQueue(c);
|
|
}
|
|
repeat = (c->flag.primary &&
|
|
!c->flag.close_asap &&
|
|
++iter < REPL_MAX_READS_PER_IO_EVENT &&
|
|
full_read);
|
|
beforeNextClient(c);
|
|
} while (repeat);
|
|
}
|
|
|
|
/* An "Address String" is a colon separated ip:port pair.
|
|
* For IPv4 it's in the form x.y.z.k:port, example: "127.0.0.1:1234".
|
|
* For IPv6 addresses we use [] around the IP part, like in "[::1]:1234".
|
|
* For Unix sockets we use path:0, like in "/tmp/valkey:0".
|
|
*
|
|
* An Address String always fits inside a buffer of CONN_ADDR_STR_LEN bytes,
|
|
* including the null term.
|
|
*
|
|
* On failure the function still populates 'addr' with the "?:0" string in case
|
|
* you want to relax error checking or need to display something anyway (see
|
|
* anetFdToString implementation for more info). */
|
|
void genClientAddrString(client *client, char *addr, size_t addr_len, int remote) {
|
|
connFormatAddr(client->conn, addr, addr_len, remote);
|
|
}
|
|
|
|
/* This function returns the client peer id, by creating and caching it
|
|
* if client->peerid is NULL, otherwise returning the cached value.
|
|
* The Peer ID never changes during the life of the client, however it
|
|
* is expensive to compute. */
|
|
char *getClientPeerId(client *c) {
|
|
char peerid[CONN_ADDR_STR_LEN] = {0};
|
|
|
|
if (c->peerid == NULL) {
|
|
genClientAddrString(c, peerid, sizeof(peerid), 1);
|
|
c->peerid = sdsnew(peerid);
|
|
}
|
|
return c->peerid;
|
|
}
|
|
|
|
/* This function returns the client bound socket name, by creating and caching
|
|
* it if client->sockname is NULL, otherwise returning the cached value.
|
|
* The Socket Name never changes during the life of the client, however it
|
|
* is expensive to compute. */
|
|
char *getClientSockname(client *c) {
|
|
char sockname[CONN_ADDR_STR_LEN] = {0};
|
|
|
|
if (c->sockname == NULL) {
|
|
genClientAddrString(c, sockname, sizeof(sockname), 0);
|
|
c->sockname = sdsnew(sockname);
|
|
}
|
|
return c->sockname;
|
|
}
|
|
|
|
/* Return non-zero if the client is connected over IPv6, zero otherwise.
 *
 * The cached peer id has the form "[IPv6]:port" for IPv6 addresses, so the
 * test is simply whether it starts with '['. */
int isClientConnIpV6(client *c) {
    client *target = c;

    /* A fake client has no address of its own: look at the current client
     * instead, when there is one. */
    if (target->flag.fake && server.current_client) target = server.current_client;

    /* Still no client with a real connection (e.g. called from a module
     * timer with no real current client): default to IPv4 to avoid crashing. */
    if (target->flag.fake || !target->conn) return 0;

    return getClientPeerId(target)[0] == '[';
}
|
|
|
|
/* Concatenate a string representing the state of a client in a human
|
|
* readable format, into the sds string 's'. */
|
|
sds catClientInfoString(sds s, client *client, int hide_user_data) {
|
|
if (!server.crashed) waitForClientIO(client);
|
|
char flags[17], events[3], capa[9], conninfo[CONN_INFO_LEN], *p;
|
|
|
|
p = flags;
|
|
if (client->flag.replica) {
|
|
if (client->flag.monitor)
|
|
*p++ = 'O';
|
|
else
|
|
*p++ = 'S';
|
|
}
|
|
|
|
if (client->flag.primary) *p++ = 'M';
|
|
if (client->flag.pubsub) *p++ = 'P';
|
|
if (client->flag.multi) *p++ = 'x';
|
|
if (client->flag.blocked) *p++ = 'b';
|
|
if (client->flag.tracking) *p++ = 't';
|
|
if (client->flag.tracking_broken_redir) *p++ = 'R';
|
|
if (client->flag.tracking_bcast) *p++ = 'B';
|
|
if (client->flag.dirty_cas) *p++ = 'd';
|
|
if (client->flag.close_after_reply) *p++ = 'c';
|
|
if (client->flag.unblocked) *p++ = 'u';
|
|
if (client->flag.close_asap) *p++ = 'A';
|
|
if (client->flag.unix_socket) *p++ = 'U';
|
|
if (client->flag.readonly) *p++ = 'r';
|
|
if (client->flag.no_evict) *p++ = 'e';
|
|
if (client->flag.no_touch) *p++ = 'T';
|
|
if (client->flag.import_source) *p++ = 'I';
|
|
if (client->slot_migration_job && isImportSlotMigrationJob(client->slot_migration_job)) *p++ = 'i';
|
|
if (client->slot_migration_job && !isImportSlotMigrationJob(client->slot_migration_job)) *p++ = 'E';
|
|
if (p == flags) *p++ = 'N';
|
|
*p++ = '\0';
|
|
|
|
p = events;
|
|
if (client->conn) {
|
|
if (connHasReadHandler(client->conn)) *p++ = 'r';
|
|
if (connHasWriteHandler(client->conn)) *p++ = 'w';
|
|
}
|
|
*p = '\0';
|
|
|
|
p = capa;
|
|
if (client->capa & CLIENT_CAPA_REDIRECT) *p++ = 'r';
|
|
*p = '\0';
|
|
|
|
/* Compute the total memory consumed by this client. */
|
|
size_t obufmem, total_mem = getClientMemoryUsage(client, &obufmem);
|
|
|
|
size_t used_blocks_of_repl_buf = 0;
|
|
if (client->repl_data && client->repl_data->ref_repl_buf_node) {
|
|
replBufBlock *last = listNodeValue(listLast(server.repl_buffer_blocks));
|
|
replBufBlock *cur = listNodeValue(client->repl_data->ref_repl_buf_node);
|
|
used_blocks_of_repl_buf = last->id - cur->id + 1;
|
|
}
|
|
sds ret = sdscatfmt(
|
|
s,
|
|
FMTARGS(
|
|
"id=%U", (unsigned long long)client->id,
|
|
" addr=%s", getClientPeerId(client),
|
|
" laddr=%s", getClientSockname(client),
|
|
" %s", connGetInfo(client->conn, conninfo, sizeof(conninfo)),
|
|
" name=%s", hide_user_data ? "*redacted*" : (client->name ? (char *)client->name->ptr : ""),
|
|
" age=%I", (long long)(commandTimeSnapshot() / 1000 - client->ctime),
|
|
" idle=%I", (long long)(server.unixtime - client->last_interaction),
|
|
" flags=%s", flags,
|
|
" capa=%s", capa,
|
|
" db=%i", client->db->id,
|
|
" sub=%i", client->pubsub_data ? (int)hashtableSize(client->pubsub_data->pubsub_channels) : 0,
|
|
" psub=%i", client->pubsub_data ? (int)hashtableSize(client->pubsub_data->pubsub_patterns) : 0,
|
|
" ssub=%i", client->pubsub_data ? (int)hashtableSize(client->pubsub_data->pubsubshard_channels) : 0,
|
|
" multi=%i", client->mstate ? client->mstate->count : -1,
|
|
" watch=%i", client->mstate ? (int)listLength(&client->mstate->watched_keys) : 0,
|
|
" qbuf=%U", client->querybuf ? (unsigned long long)sdslen(client->querybuf) : 0,
|
|
" qbuf-free=%U", client->querybuf ? (unsigned long long)sdsavail(client->querybuf) : 0,
|
|
" argv-mem=%U", (unsigned long long)client->argv_len_sum,
|
|
" multi-mem=%U", client->mstate ? (unsigned long long)client->mstate->argv_len_sums : 0,
|
|
" rbs=%U", (unsigned long long)client->buf_usable_size,
|
|
" rbp=%U", (unsigned long long)client->buf_peak,
|
|
" obl=%U", (unsigned long long)client->bufpos,
|
|
" oll=%U", (unsigned long long)listLength(client->reply) + used_blocks_of_repl_buf,
|
|
" omem=%U", (unsigned long long)obufmem, /* should not include client->buf since we want to see 0 for static clients. */
|
|
" tot-mem=%U", (unsigned long long)total_mem,
|
|
" events=%s", events,
|
|
" cmd=%s", client->lastcmd ? client->lastcmd->fullname : "NULL",
|
|
" user=%s", hide_user_data ? "*redacted*" : (client->user ? client->user->name : "(superuser)"),
|
|
" redir=%I", (client->flag.tracking) ? (long long)client->pubsub_data->client_tracking_redirection : -1,
|
|
" resp=%i", client->resp,
|
|
" lib-name=%s", client->lib_name ? (char *)client->lib_name->ptr : "",
|
|
" lib-ver=%s", client->lib_ver ? (char *)client->lib_ver->ptr : "",
|
|
" tot-net-in=%U", client->net_input_bytes,
|
|
" tot-net-out=%U", client->net_output_bytes,
|
|
" tot-cmds=%U", client->commands_processed));
|
|
return ret;
|
|
}
|
|
|
|
/* Concatenate a string representing the state of a client in a human
|
|
* readable format, into the sds string 's'.
|
|
*
|
|
* This is a simplified and shortened version of catClientInfoString,
|
|
* it only added some basic fields for tracking clients. */
|
|
sds catClientInfoShortString(sds s, client *client, int hide_user_data) {
|
|
if (!server.crashed) waitForClientIO(client);
|
|
char conninfo[CONN_INFO_LEN];
|
|
|
|
sds ret = sdscatfmt(
|
|
s,
|
|
FMTARGS(
|
|
"id=%U", (unsigned long long)client->id,
|
|
" addr=%s", getClientPeerId(client),
|
|
" laddr=%s", getClientSockname(client),
|
|
" %s", connGetInfo(client->conn, conninfo, sizeof(conninfo)),
|
|
" name=%s", hide_user_data ? "*redacted*" : (client->name ? (char *)client->name->ptr : ""),
|
|
" user=%s", hide_user_data ? "*redacted*" : (client->user ? client->user->name : "(superuser)"),
|
|
" lib-name=%s", client->lib_name ? (char *)client->lib_name->ptr : "",
|
|
" lib-ver=%s", client->lib_ver ? (char *)client->lib_ver->ptr : ""));
|
|
return ret;
|
|
}
|
|
|
|
sds getAllClientsInfoString(int type, int hide_user_data) {
|
|
listNode *ln;
|
|
listIter li;
|
|
client *client;
|
|
sds o = sdsnewlen(SDS_NOINIT, 200 * listLength(server.clients));
|
|
sdsclear(o);
|
|
listRewind(server.clients, &li);
|
|
while ((ln = listNext(&li)) != NULL) {
|
|
client = listNodeValue(ln);
|
|
if (type != -1 && getClientType(client) != type) continue;
|
|
o = catClientInfoString(o, client, hide_user_data);
|
|
o = sdscatlen(o, "\n", 1);
|
|
}
|
|
return o;
|
|
}
|
|
|
|
/* Build a new sds string with one catClientInfoString() line, terminated by
 * "\n", for every client in server.clients accepted by clientMatchesFilter().
 *
 * 'hide_user_data' is forwarded to catClientInfoString(). */
static sds getAllFilteredClientsInfoString(clientFilter *client_filter, int hide_user_data) {
    listNode *ln;
    listIter li;
    /* sdsempty() already yields a zero-length string, so no sdsclear() is
     * needed before appending. */
    sds o = sdsempty();

    listRewind(server.clients, &li);
    while ((ln = listNext(&li)) != NULL) {
        client *client = listNodeValue(ln);
        if (!clientMatchesFilter(client, client_filter)) continue;
        o = catClientInfoString(o, client, hide_user_data);
        o = sdscatlen(o, "\n", 1);
    }
    return o;
}
|
|
|
|
/* Check validity of an attribute that's gonna be shown in CLIENT LIST. */
|
|
int validateClientAttr(const char *val) {
|
|
/* Check if the charset is ok. We need to do this otherwise
|
|
* CLIENT LIST format will break. You should always be able to
|
|
* split by space to get the different fields. */
|
|
while (*val) {
|
|
if (*val < '!' || *val > '~') { /* ASCII is assumed. */
|
|
return C_ERR;
|
|
}
|
|
val++;
|
|
}
|
|
return C_OK;
|
|
}
|
|
|
|
/* Returns C_OK if the name is valid. Returns C_ERR & sets `err` (when provided) otherwise. */
|
|
int validateClientName(robj *name, const char **err) {
|
|
const char *err_msg = "Client names cannot contain spaces, newlines or special characters.";
|
|
int len = (name != NULL) ? sdslen(name->ptr) : 0;
|
|
/* We allow setting the client name to an empty string. */
|
|
if (len == 0) return C_OK;
|
|
if (validateClientAttr(name->ptr) == C_ERR) {
|
|
if (err) *err = err_msg;
|
|
return C_ERR;
|
|
}
|
|
return C_OK;
|
|
}
|
|
|
|
/* Returns C_OK if the name has been set or C_ERR if the name is invalid. */
|
|
int clientSetName(client *c, robj *name, const char **err) {
|
|
if (validateClientName(name, err) == C_ERR) {
|
|
return C_ERR;
|
|
}
|
|
int len = (name != NULL) ? sdslen(name->ptr) : 0;
|
|
/* Setting the client name to an empty string actually removes
|
|
* the current name. */
|
|
if (len == 0) {
|
|
if (c->name) decrRefCount(c->name);
|
|
c->name = NULL;
|
|
return C_OK;
|
|
}
|
|
if (c->name) decrRefCount(c->name);
|
|
c->name = name;
|
|
incrRefCount(name);
|
|
return C_OK;
|
|
}
|
|
|
|
/* This function implements CLIENT SETNAME, including replying to the
|
|
* user with an error if the charset is wrong (in that case C_ERR is
|
|
* returned). If the function succeeded C_OK is returned, and it's up
|
|
* to the caller to send a reply if needed.
|
|
*
|
|
* Setting an empty string as name has the effect of unsetting the
|
|
* currently set name: the client will remain unnamed.
|
|
*
|
|
* This function is also used to implement the HELLO SETNAME option. */
|
|
int clientSetNameOrReply(client *c, robj *name) {
|
|
const char *err = NULL;
|
|
int result = clientSetName(c, name, &err);
|
|
if (result == C_ERR) {
|
|
addReplyError(c, err);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/* Set client or connection related info */
|
|
void clientSetinfoCommand(client *c) {
|
|
sds attr = c->argv[2]->ptr;
|
|
robj *valob = c->argv[3];
|
|
sds val = valob->ptr;
|
|
robj **destvar = NULL;
|
|
if (!strcasecmp(attr, "lib-name")) {
|
|
destvar = &c->lib_name;
|
|
} else if (!strcasecmp(attr, "lib-ver")) {
|
|
destvar = &c->lib_ver;
|
|
} else {
|
|
addReplyErrorFormat(c, "Unrecognized option '%s'", attr);
|
|
return;
|
|
}
|
|
|
|
if (validateClientAttr(val) == C_ERR) {
|
|
addReplyErrorFormat(c, "%s cannot contain spaces, newlines or special characters.", attr);
|
|
return;
|
|
}
|
|
if (*destvar) decrRefCount(*destvar);
|
|
if (sdslen(val)) {
|
|
*destvar = valob;
|
|
incrRefCount(valob);
|
|
} else
|
|
*destvar = NULL;
|
|
addReply(c, shared.ok);
|
|
}
|
|
|
|
/* Reset the client state to resemble a newly connected client.
|
|
*/
|
|
void resetCommand(client *c) {
|
|
/* MONITOR clients are also marked with CLIENT_REPLICA, we need to
|
|
* distinguish between the two.
|
|
*/
|
|
struct ClientFlags flags = c->flag;
|
|
if (flags.monitor) {
|
|
flags.monitor = 0;
|
|
flags.replica = 0;
|
|
}
|
|
|
|
if (flags.replica || flags.primary || flags.module) {
|
|
addReplyError(c, "can only reset normal client connections");
|
|
return;
|
|
}
|
|
|
|
clearClientConnectionState(c);
|
|
addReplyStatus(c, "RESET");
|
|
}
|
|
|
|
/* Disconnect the current client */
|
|
void quitCommand(client *c) {
|
|
addReply(c, shared.ok);
|
|
c->flag.close_after_reply = 1;
|
|
}
|
|
|
|
/* Parse the client filter options found in c->argv, starting at 'index',
 * into 'filter'.
 *
 * Option names are matched case insensitively. ID and NOT-ID accept a run of
 * numeric ids that ends at the first non-numeric argument; every other
 * option takes exactly one value.
 *
 * Storage of the parsed values:
 *   - addr / laddr / name (and their NOT- variants) point directly at the
 *     argument strings, no copy is made;
 *   - flags / capa / ip (and NOT- variants) are fresh sds copies, replacing
 *     and freeing any previous value;
 *   - lib-name / lib-ver (and NOT- variants) hold a reference to the
 *     argument object, releasing any previous one.
 * NOTE(review): releasing what is stored in 'filter', also after a C_ERR
 * return, is presumably up to the caller -- confirm.
 *
 * Returns C_OK when all arguments were consumed. On any error a reply has
 * already been sent to the client and C_ERR is returned. */
static int parseClientFiltersOrReply(client *c, int index, clientFilter *filter) {
    while (index < c->argc) {
        int moreargs = c->argc > index + 1;
        const char *opt = c->argv[index]->ptr;
        /* Value following the option name; only used when moreargs is true. */
        robj *valobj = moreargs ? c->argv[index + 1] : NULL;

        if (!strcasecmp(opt, "id")) {
            if (filter->ids == NULL) {
                /* Initialize the intset for IDs */
                filter->ids = intsetNew();
            }
            index++; /* Move to the first ID after "ID" */

            /* Process all IDs until a non-numeric argument or end of args */
            while (index < c->argc) {
                long long id;
                if (!string2ll(c->argv[index]->ptr, sdslen(c->argv[index]->ptr), &id)) {
                    break; /* Stop processing IDs if a non-numeric argument is encountered */
                }
                if (id < 1) {
                    addReplyError(c, "client-id should be greater than 0");
                    return C_ERR;
                }

                uint8_t added;
                filter->ids = intsetAdd(filter->ids, id, &added);
                index++; /* Move to the next argument */
            }
        } else if (!strcasecmp(opt, "not-id")) {
            if (filter->not_ids == NULL) {
                /* Initialize the intset for NOT-IDs */
                filter->not_ids = intsetNew();
            }
            index++; /* Move to the first ID after "NOT-ID" */

            /* Process all NOT-IDs until a non-numeric argument or end of args */
            while (index < c->argc) {
                long long not_id;
                if (!string2ll(c->argv[index]->ptr, sdslen(c->argv[index]->ptr), &not_id)) {
                    break; /* Stop processing NOT-IDs if a non-numeric argument is encountered */
                }
                if (not_id < 1) {
                    addReplyError(c, "client-id should be greater than 0");
                    return C_ERR;
                }

                uint8_t added;
                filter->not_ids = intsetAdd(filter->not_ids, not_id, &added);
                index++; /* Move to the next argument */
            }
        } else if (!strcasecmp(opt, "maxage") && moreargs) {
            long long maxage;

            if (getLongLongFromObjectOrReply(c, valobj, &maxage,
                                             "maxage is not an integer or out of range") != C_OK)
                return C_ERR;
            if (maxage <= 0) {
                addReplyError(c, "maxage should be greater than 0");
                return C_ERR;
            }

            filter->max_age = maxage;
            index += 2;
        } else if (!strcasecmp(opt, "type") && moreargs) {
            filter->type = getClientTypeByName(valobj->ptr);
            if (filter->type == -1) {
                addReplyErrorFormat(c, "Unknown client type '%s'", (char *)valobj->ptr);
                return C_ERR;
            }
            index += 2;
        } else if (!strcasecmp(opt, "not-type") && moreargs) {
            filter->not_type = getClientTypeByName(valobj->ptr);
            if (filter->not_type == -1) {
                addReplyErrorFormat(c, "Unknown client type '%s'", (char *)valobj->ptr);
                return C_ERR;
            }
            index += 2;
        } else if (!strcasecmp(opt, "addr") && moreargs) {
            filter->addr = valobj->ptr;
            index += 2;
        } else if (!strcasecmp(opt, "not-addr") && moreargs) {
            filter->not_addr = valobj->ptr;
            index += 2;
        } else if (!strcasecmp(opt, "laddr") && moreargs) {
            filter->laddr = valobj->ptr;
            index += 2;
        } else if (!strcasecmp(opt, "not-laddr") && moreargs) {
            filter->not_laddr = valobj->ptr;
            index += 2;
        } else if (!strcasecmp(opt, "user") && moreargs) {
            filter->user = ACLGetUserByName(valobj->ptr, sdslen(valobj->ptr));
            if (filter->user == NULL) {
                addReplyErrorFormat(c, "No such user '%s'", (char *)valobj->ptr);
                return C_ERR;
            }
            index += 2;
        } else if (!strcasecmp(opt, "not-user") && moreargs) {
            filter->not_user = ACLGetUserByName(valobj->ptr, sdslen(valobj->ptr));
            if (filter->not_user == NULL) {
                addReplyErrorFormat(c, "No such user '%s'", (char *)valobj->ptr);
                return C_ERR;
            }
            index += 2;
        } else if (!strcasecmp(opt, "skipme") && moreargs) {
            if (!strcasecmp(valobj->ptr, "yes")) {
                filter->skipme = 1;
            } else if (!strcasecmp(valobj->ptr, "no")) {
                filter->skipme = 0;
            } else {
                addReplyErrorObject(c, shared.syntaxerr);
                return C_ERR;
            }
            index += 2;
        } else if (!strcasecmp(opt, "idle") && moreargs) {
            long long idle_time;

            if (getLongLongFromObjectOrReply(c, valobj, &idle_time,
                                             "idle is not an integer or out of range") != C_OK)
                return C_ERR;
            if (idle_time <= 0) {
                addReplyError(c, "idle should be greater than 0");
                return C_ERR;
            }

            filter->idle = idle_time;
            index += 2;
        } else if (!strcasecmp(opt, "flags") && moreargs) {
            /* A repeated option replaces the previously stored copy. */
            if (filter->flags) {
                sdsfree(filter->flags);
                filter->flags = NULL;
            }
            filter->flags = sdsnew(valobj->ptr);
            if (validateClientFlagFilter(filter->flags) == C_ERR) {
                addReplyErrorFormat(c, "Unknown flags found in the provided filter: %s", filter->flags);
                return C_ERR;
            }
            index += 2;
        } else if (!strcasecmp(opt, "not-flags") && moreargs) {
            if (filter->not_flags) {
                sdsfree(filter->not_flags);
                filter->not_flags = NULL;
            }
            filter->not_flags = sdsnew(valobj->ptr);
            if (validateClientFlagFilter(filter->not_flags) == C_ERR) {
                addReplyErrorFormat(c, "Unknown flags found in the NOT-FLAGS filter: %s", filter->not_flags);
                return C_ERR;
            }
            index += 2;
        } else if (!strcasecmp(opt, "name") && moreargs) {
            filter->name = valobj->ptr;
            index += 2;
        } else if (!strcasecmp(opt, "not-name") && moreargs) {
            filter->not_name = valobj->ptr;
            index += 2;
        } else if (!strcasecmp(opt, "lib-name") && moreargs) {
            /* A repeated option releases the previously referenced object. */
            if (filter->lib_name) {
                decrRefCount(filter->lib_name);
                filter->lib_name = NULL;
            }
            filter->lib_name = valobj;
            incrRefCount(filter->lib_name);
            index += 2;
        } else if (!strcasecmp(opt, "not-lib-name") && moreargs) {
            if (filter->not_lib_name) {
                decrRefCount(filter->not_lib_name);
                filter->not_lib_name = NULL;
            }
            filter->not_lib_name = valobj;
            incrRefCount(filter->not_lib_name);
            index += 2;
        } else if (!strcasecmp(opt, "lib-ver") && moreargs) {
            if (filter->lib_ver) {
                decrRefCount(filter->lib_ver);
                filter->lib_ver = NULL;
            }
            filter->lib_ver = valobj;
            incrRefCount(filter->lib_ver);
            index += 2;
        } else if (!strcasecmp(opt, "not-lib-ver") && moreargs) {
            if (filter->not_lib_ver) {
                decrRefCount(filter->not_lib_ver);
                filter->not_lib_ver = NULL;
            }
            filter->not_lib_ver = valobj;
            incrRefCount(filter->not_lib_ver);
            index += 2;
        } else if (!strcasecmp(opt, "db") && moreargs) {
            int db_id;
            if (getIntFromObjectOrReply(c, valobj, &db_id,
                                        "DB is not an integer or out of range") != C_OK)
                return C_ERR;
            if (db_id < 0 || db_id >= server.dbnum) {
                addReplyErrorFormat(c, "DB number should be between 0 and %d", server.dbnum - 1);
                return C_ERR;
            }
            filter->db_number = db_id;
            index += 2;
        } else if (!strcasecmp(opt, "not-db") && moreargs) {
            int not_db_id;
            if (getIntFromObjectOrReply(c, valobj, &not_db_id,
                                        "NOT-DB is not an integer or out of range") != C_OK)
                return C_ERR;
            if (not_db_id < 0 || not_db_id >= server.dbnum) {
                addReplyErrorFormat(c, "NOT-DB number should be between 0 and %d", server.dbnum - 1);
                return C_ERR;
            }
            filter->not_db_number = not_db_id;
            index += 2;
        } else if (!strcasecmp(opt, "capa") && moreargs) {
            if (filter->capa) {
                sdsfree(filter->capa);
                filter->capa = NULL;
            }
            filter->capa = sdsnew(valobj->ptr);
            if (validateClientCapaFilter(filter->capa) == C_ERR) {
                addReplyErrorFormat(c, "Unknown capa found in the provided filter: %s", filter->capa);
                return C_ERR;
            }
            index += 2;
        } else if (!strcasecmp(opt, "not-capa") && moreargs) {
            if (filter->not_capa) {
                sdsfree(filter->not_capa);
                filter->not_capa = NULL;
            }
            filter->not_capa = sdsnew(valobj->ptr);
            if (validateClientCapaFilter(filter->not_capa) == C_ERR) {
                addReplyErrorFormat(c, "Unknown capa found in the NOT-CAPA filter: %s", filter->not_capa);
                return C_ERR;
            }
            index += 2;
        } else if (!strcasecmp(opt, "ip") && moreargs) {
            if (filter->ip) {
                sdsfree(filter->ip);
                filter->ip = NULL;
            }
            filter->ip = sdsnew(valobj->ptr);
            index += 2;
        } else if (!strcasecmp(opt, "not-ip") && moreargs) {
            if (filter->not_ip) {
                sdsfree(filter->not_ip);
                filter->not_ip = NULL;
            }
            filter->not_ip = sdsnew(valobj->ptr);
            index += 2;
        } else {
            /* Unknown option, or a known option missing its value. */
            addReplyErrorObject(c, shared.syntaxerr);
            return C_ERR;
        }
    }
    return C_OK;
}
|
|
|
|
/* Check that a capa filter string contains only known capability letters.
 *
 * 'r' is the only letter accepted. Returns C_OK when every character is
 * known (an empty string included), C_ERR otherwise. */
static int validateClientCapaFilter(sds capa) {
    const size_t len = sdslen(capa);

    for (size_t pos = 0; pos < len; pos++) {
        if (capa[pos] != 'r') return C_ERR;
    }
    return C_OK;
}
|
|
|
|
/* Check that a flag filter string contains only known client flag letters.
 *
 * Returns C_OK when every character is one of the accepted letters (an empty
 * string included), C_ERR otherwise. */
static int validateClientFlagFilter(sds flag_filter) {
    /* Every flag letter accepted in a filter. */
    static const char known_flags[] = "OSMPxbtRBdcuAUreTIiEN";
    const size_t len = sdslen(flag_filter);

    for (size_t pos = 0; pos < len; pos++) {
        /* Search only the letters, not the terminator of the table, so a NUL
         * byte in the filter is rejected like any other unknown character. */
        if (memchr(known_flags, flag_filter[pos], sizeof(known_flags) - 1) == NULL) {
            return C_ERR;
        }
    }
    return C_OK;
}
|
|
|
|
|
|
/* Return 1 if 'client' satisfies every condition set in 'client_filter',
 * 0 otherwise.
 *
 * A condition is considered unset when its field is NULL (pointers), -1
 * (type, not_type, db_number, not_db_number) or 0 (max_age, idle, skipme).
 * Positive conditions must all match; negative (not_*) conditions must all
 * fail to match. */
static int clientMatchesFilter(client *client, clientFilter *client_filter) {
    clientFilter *f = client_filter;

    /* Positive conditions: reject as soon as one of them does not match. */
    if (f->addr && strcmp(getClientPeerId(client), f->addr) != 0) return 0;
    if (f->laddr && strcmp(getClientSockname(client), f->laddr) != 0) return 0;
    if (f->type != -1 && getClientType(client) != f->type) return 0;
    if (f->ids && !intsetFind(f->ids, client->id)) return 0;
    if (f->user && client->user != f->user) return 0;
    if (f->skipme && client == server.current_client) return 0;
    if (f->max_age != 0) {
        /* The client must be at least max_age old. */
        if ((long long)(commandTimeSnapshot() / 1000 - client->ctime) < f->max_age) return 0;
    }
    if (f->idle != 0) {
        /* The client must have been idle for at least 'idle'. */
        if ((long long)(commandTimeSnapshot() / 1000 - client->last_interaction) < f->idle) return 0;
    }
    if (f->flags && clientMatchesFlagFilter(client, f->flags) == 0) return 0;
    if (f->name &&
        (!client->name || !client->name->ptr || strcmp(client->name->ptr, f->name) != 0)) {
        return 0;
    }
    if (f->lib_name) {
        if (!client->lib_name || compareStringObjects(client->lib_name, f->lib_name) != 0) return 0;
    }
    if (f->lib_ver) {
        if (!client->lib_ver || compareStringObjects(client->lib_ver, f->lib_ver) != 0) return 0;
    }
    if (f->db_number != -1 && client->db->id != f->db_number) return 0;
    if (f->capa && clientMatchesCapaFilter(client, f->capa) == 0) return 0;
    if (f->ip && clientMatchesIpFilter(client, f->ip) == 0) return 0;

    /* Negative conditions: reject as soon as one of them matches. */
    if (f->not_addr && strcmp(getClientPeerId(client), f->not_addr) == 0) return 0;
    if (f->not_laddr && strcmp(getClientSockname(client), f->not_laddr) == 0) return 0;
    if (f->not_type != -1 && getClientType(client) == f->not_type) return 0;
    if (f->not_ids && intsetFind(f->not_ids, client->id)) return 0;
    if (f->not_user && client->user == f->not_user) return 0;
    if (f->not_flags && clientMatchesFlagFilter(client, f->not_flags) != 0) return 0;
    if (f->not_name && client->name && client->name->ptr &&
        strcmp(client->name->ptr, f->not_name) == 0) {
        return 0;
    }
    if (f->not_lib_name && client->lib_name) {
        if (compareStringObjects(client->lib_name, f->not_lib_name) == 0) return 0;
    }
    if (f->not_lib_ver && client->lib_ver) {
        if (compareStringObjects(client->lib_ver, f->not_lib_ver) == 0) return 0;
    }
    if (f->not_db_number != -1 && client->db->id == f->not_db_number) return 0;
    if (f->not_capa && clientMatchesCapaFilter(client, f->not_capa) != 0) return 0;
    if (f->not_ip && clientMatchesIpFilter(client, f->not_ip) != 0) return 0;

    /* Nothing rejected the client: it matches the filter. */
    return 1;
}
|
|
|
|
/* Return 1 if the IP part of the client's peer id equals 'ip', 0 otherwise.
 *
 * The peer id is expected as "ip:port" or "[ipv6]:port". 'ip' is compared
 * against the address with the square brackets removed, and the address must
 * be followed directly by ":port". A port starting with '0' is rejected. */
static int clientMatchesIpFilter(client *c, sds ip) {
    const char *cursor = getClientPeerId(c);
    if (cursor == NULL) return 0;

    /* IPv6 addresses are wrapped in square brackets. */
    if (*cursor == '[') cursor++;

    const size_t ip_len = sdslen(ip);
    if (strncmp(cursor, ip, ip_len) != 0) return 0;
    cursor += ip_len;

    /* Skip the closing bracket of an IPv6 address. */
    if (*cursor == ']') cursor++;

    /* The address must end exactly here, at the ip:port separator. */
    if (*cursor != ':') return 0;

    /* Disallow port=0. */
    return cursor[1] != '0';
}
|
|
|
|
/* Return 1 if the client has every capability listed in 'capa_filter',
 * 0 otherwise.
 *
 * The only letter understood is 'r' (the client supports redirection, i.e.
 * CLIENT_CAPA_REDIRECT is set in c->capa). Any other letter makes the match
 * fail. An empty filter matches every client. */
static int clientMatchesCapaFilter(client *c, sds capa_filter) {
    const size_t len = sdslen(capa_filter);

    for (size_t pos = 0; pos < len; pos++) {
        /* Invalid capa, return false. */
        if (capa_filter[pos] != 'r') return 0;

        /* 'r': the client must support redirection. */
        if (!(c->capa & CLIENT_CAPA_REDIRECT)) return 0;
    }

    /* Every listed capability is present. */
    return 1;
}
|
|
|
|
|
|
/* Return 1 if the client has every flag listed in 'flag_filter', 0 otherwise.
 *
 * Each character of the filter is one of the flag letters that
 * catClientInfoString() prints in the "flags=" field. An unknown letter
 * makes the match fail. An empty filter matches every client.
 *
 * 'S' matches only clients that would be displayed as 'S': MONITOR clients
 * also carry the replica flag but are displayed as 'O', so they are matched
 * by 'O' and not by 'S'. */
static int clientMatchesFlagFilter(client *c, sds flag_filter) {
    const size_t len = sdslen(flag_filter);

    for (size_t i = 0; i < len; i++) {
        int present;

        switch (flag_filter[i]) {
        case 'O': /* client in MONITOR mode */
            present = c->flag.replica && c->flag.monitor;
            break;
        case 'S': /* client is a replica node connection to this instance */
            present = c->flag.replica && !c->flag.monitor;
            break;
        case 'M': /* client is a primary */
            present = c->flag.primary;
            break;
        case 'P': /* client is a Pub/Sub subscriber */
            present = c->flag.pubsub;
            break;
        case 'x': /* client is in a MULTI/EXEC context */
            present = c->flag.multi;
            break;
        case 'b': /* client is waiting in a blocking operation */
            present = c->flag.blocked;
            break;
        case 't': /* client enabled keys tracking in order to perform client side caching */
            present = c->flag.tracking;
            break;
        case 'R': /* client tracking target client is invalid */
            present = c->flag.tracking_broken_redir;
            break;
        case 'B': /* client enabled broadcast tracking mode */
            present = c->flag.tracking_bcast;
            break;
        case 'd': /* dirty CAS */
            present = c->flag.dirty_cas;
            break;
        case 'c': /* close after reply */
            present = c->flag.close_after_reply;
            break;
        case 'u': /* client is unblocked */
            present = c->flag.unblocked;
            break;
        case 'A': /* close ASAP */
            present = c->flag.close_asap;
            break;
        case 'U': /* client is connected via a Unix domain socket */
            present = c->flag.unix_socket;
            break;
        case 'r': /* client is in readonly mode against a cluster node */
            present = c->flag.readonly;
            break;
        case 'e': /* client is excluded from the client eviction mechanism */
            present = c->flag.no_evict;
            break;
        case 'T': /* client will not touch the LRU/LFU of the keys it accesses */
            present = c->flag.no_touch;
            break;
        case 'I': /* import source flag */
            present = c->flag.import_source;
            break;
        case 'i': /* slot migration import flag */
            present = c->slot_migration_job && isImportSlotMigrationJob(c->slot_migration_job);
            break;
        case 'E': /* slot migration export flag */
            present = c->slot_migration_job && !isImportSlotMigrationJob(c->slot_migration_job);
            break;
        case 'N': /* no flags: none of the displayed flags may be set */
            present = !(c->flag.replica || c->flag.primary || c->flag.pubsub ||
                        c->flag.multi || c->flag.blocked || c->flag.tracking ||
                        c->flag.tracking_broken_redir || c->flag.tracking_bcast ||
                        c->flag.dirty_cas || c->flag.close_after_reply ||
                        c->flag.unblocked || c->flag.close_asap ||
                        c->flag.unix_socket || c->flag.readonly ||
                        c->flag.no_evict || c->flag.no_touch ||
                        c->flag.import_source || c->slot_migration_job);
            break;
        default:
            /* Invalid flag, return false */
            present = 0;
            break;
        }

        if (!present) return 0;
    }

    /* If the loop completes, the client matches the flag filter */
    return 1;
}
|
|
|
|
|
|
/* CLIENT HELP: reply with the usage text of every CLIENT subcommand.
 *
 * The text lives in a NULL-terminated array of C strings that is handed
 * as-is to addReplyHelp(). */
void clientHelpCommand(client *c) {
    const char *usage[] = {
        "CACHING (YES|NO)",
        " Enable/disable tracking of the keys for next command in OPTIN/OPTOUT modes.",
        "CAPA <option> [options...]",
        " The client claims its some capability options. Options are:",
        " * REDIRECT",
        " The client can handle redirection during primary and replica failover in standalone mode.",
        "GETREDIR",
        " Return the client ID we are redirecting to when tracking is enabled.",
        "GETNAME",
        " Return the name of the current connection.",
        "ID",
        " Return the ID of the current connection.",
        "INFO",
        " Return information about the current client connection.",
        "KILL <ip:port>",
        " Kill connection made from <ip:port>.",
        "KILL <option> <value> [<option> <value> [...]]",
        " Kill connections. Options are:",
        " * ADDR (<ip:port>|<unixsocket>:0)",
        " Kill connections made from the specified address.",
        " * LADDR (<ip:port>|<unixsocket>:0)",
        " Kill connections made to the specified local address.",
        " * TYPE (NORMAL|PRIMARY|REPLICA|PUBSUB)",
        " Kill connections by type.",
        " * USER <username>",
        " Kill connections authenticated by <username>.",
        " * SKIPME (YES|NO)",
        " Skip killing current connection (default: yes).",
        " * ID <client-id> [<client-id>...]",
        " Kill connections by client IDs.",
        " * MAXAGE <maxage>",
        " Kill connections older than the specified age.",
        " * FLAGS <flags>",
        " Kill connections that include the specified flags.",
        " * NAME <client-name>",
        " Kill connections with the specified name.",
        " * IDLE <idle>",
        " Kill connections with idle time greater than or equal to <idle> seconds.",
        " * LIB-NAME <library-name>",
        " Kill connections with the specified library name.",
        " * LIB-VER <library-version>",
        " Kill connections with the specified library version.",
        " * DB <db-id>",
        " Kill connections currently operating on the specified database ID.",
        " * CAPA <capa>",
        " Kill connections currently with the specified capa.",
        " * IP <ip>",
        " Kill connections made from the specified ip.",
        "LIST [options ...]",
        " Return information about client connections. Options:",
        " * TYPE (NORMAL|PRIMARY|REPLICA|PUBSUB)",
        " Return clients of specified type.",
        " * USER <username>",
        " Return clients authenticated by <username>.",
        " * ADDR <ip:port>",
        " Return clients connected from the specified address.",
        " * LADDR <ip:port>",
        " Return clients connected to the specified local address.",
        " * ID <client-id> [<client-id>...]",
        " Return clients with the specified IDs.",
        " * SKIPME (YES|NO)",
        " Exclude the current client from the list (default: no).",
        " * MAXAGE <maxage>",
        " List connections older than the specified age.",
        " * FLAGS <flags>",
        " Return clients with the specified flags.",
        " * NAME <client-name>",
        " Return clients with the specified name.",
        " * IDLE <idle>",
        " Return clients with idle time greater than or equal to <idle> seconds.",
        " * LIB-NAME <lib-name>",
        " Return clients with the specified lib name.",
        " * LIB-VER <lib-version>",
        " Return clients with the specified lib version.",
        " * DB <db-id>",
        " Return clients currently operating on the specified database ID.",
        " * CAPA <capa>",
        " Return connections currently with the specified capa.",
        " * IP <ip>",
        " Return connections made from the specified ip.",
        "UNPAUSE",
        " Stop the current client pause, resuming traffic.",
        "PAUSE <timeout> [WRITE|ALL]",
        " Suspend all, or just write, clients for <timeout> milliseconds.",
        "REPLY (ON|OFF|SKIP)",
        " Control the replies sent to the current connection.",
        "SETNAME <name>",
        " Assign the name <name> to the current connection.",
        "SETINFO <option> <value>",
        " Set client meta attr. Options are:",
        " * LIB-NAME: the client lib name.",
        " * LIB-VER: the client lib version.",
        "UNBLOCK <clientid> [TIMEOUT|ERROR]",
        " Unblock the specified blocked client.",
        "TRACKING (ON|OFF) [REDIRECT <id>] [BCAST] [PREFIX <prefix> [...]]",
        " [OPTIN] [OPTOUT] [NOLOOP]",
        " Control server assisted client side caching.",
        "TRACKINGINFO",
        " Report tracking status for the current connection.",
        "NO-EVICT (ON|OFF)",
        " Protect current client connection from eviction.",
        "NO-TOUCH (ON|OFF)",
        " Will not touch LRU/LFU stats when this mode is on.",
        "IMPORT-SOURCE (ON|OFF)",
        " Mark this connection as an import source if import-mode is enabled.",
        " Sync tools can set their connections into 'import-source' state to visit",
        " expired keys.",
        NULL,
    };

    addReplyHelp(c, usage);
}
|
|
|
|
/* CLIENT ID: reply with the ID stored in the calling client (c->id) as an
 * integer. */
void clientIDCommand(client *c) {
    addReplyLongLong(c, c->id);
}
|
|
|
|
/* CLIENT INFO: reply with the info string of the calling client, terminated
 * by a newline, as a verbatim "txt" reply. */
void clientInfoCommand(client *c) {
    sds line = sdsempty();

    line = catClientInfoString(line, c, 0);
    line = sdscatlen(line, "\n", 1);
    addReplyVerbatim(c, line, sdslen(line), "txt");

    /* The reply has been queued, so the temporary string is released here. */
    sdsfree(line);
}
|
|
|
|
/* CLIENT LIST [options ...]
 *
 * With no options (argc == 2) every client is listed. With more than three
 * arguments the options are parsed into a clientFilter and only the matching
 * clients are listed. Any other argument count is a syntax error. */
void clientListCommand(client *c) {
    sds listing = NULL;

    /* Exactly one trailing token (or fewer than two arguments) can never
     * form a valid option list. */
    if (c->argc <= 3 && c->argc != 2) {
        addReplyErrorObject(c, shared.syntaxerr);
        return;
    }

    if (c->argc > 3) {
        clientFilter filter = {0};
        /* -1 is the value the filter is initialised with for "no type / no
         * db constraint". */
        filter.type = -1;
        filter.not_type = -1;
        filter.db_number = -1;
        filter.not_db_number = -1;

        /* Options start right after "CLIENT LIST", i.e. at index 2. */
        int parsed = parseClientFiltersOrReply(c, 2, &filter);
        if (parsed == C_OK) listing = getAllFilteredClientsInfoString(&filter, 0);
        freeClientFilter(&filter);
        if (parsed != C_OK) return; /* The parser already replied. */
    }

    /* Unfiltered listing: type -1, as in the unfiltered CLIENT LIST form. */
    if (listing == NULL) listing = getAllClientsInfoString(-1, 0);

    addReplyVerbatim(c, listing, sdslen(listing), "txt");
    sdsfree(listing);
}
|
|
|
|
/* CLIENT REPLY ON|OFF|SKIP
 *
 * ON   clears both reply_skip and reply_off and replies OK.
 * OFF  sets reply_off; no reply is queued by this function.
 * SKIP sets reply_skip_next unless reply_off is already set; no reply is
 *      queued by this function.
 * Any other argument yields a syntax error. */
void clientReplyCommand(client *c) {
    const char *mode = c->argv[2]->ptr;

    if (strcasecmp(mode, "on") == 0) {
        c->flag.reply_skip = 0;
        c->flag.reply_off = 0;
        addReply(c, shared.ok);
        return;
    }

    if (strcasecmp(mode, "off") == 0) {
        c->flag.reply_off = 1;
        return;
    }

    if (strcasecmp(mode, "skip") == 0) {
        if (!c->flag.reply_off) c->flag.reply_skip_next = 1;
        return;
    }

    addReplyErrorObject(c, shared.syntaxerr);
}
|
|
|
|
/* CLIENT NO-EVICT ON|OFF
 *
 * ON sets the no_evict flag and removes the client from its memory usage
 * bucket; OFF clears the flag and refreshes the client's memory usage and
 * bucket. Both reply OK. Anything else is a syntax error. */
void clientNoEvictCommand(client *c) {
    const char *mode = c->argv[2]->ptr;

    if (strcasecmp(mode, "on") == 0) {
        c->flag.no_evict = 1;
        removeClientFromMemUsageBucket(c, 0);
    } else if (strcasecmp(mode, "off") == 0) {
        c->flag.no_evict = 0;
        updateClientMemUsageAndBucket(c);
    } else {
        addReplyErrorObject(c, shared.syntaxerr);
        return;
    }

    addReply(c, shared.ok);
}
|
|
|
|
/* CLIENT KILL <ip:port>
 * CLIENT KILL <option> [value] ... <option> [value]
 *
 * Old form (argc == 3): the single argument is used as the address filter,
 * the caller itself may be killed, and the reply is OK or "No such client".
 * New form (argc > 3): options are parsed into a clientFilter (skipme
 * defaults to 1) and the reply is the number of clients killed.
 *
 * Every exit path goes through the client_kill_done label so that whatever
 * the option parser allocated inside the filter is released. */
void clientKillCommand(client *c) {
    clientFilter client_filter = {0};
    int killed = 0;
    int close_this_client = 0;

    client_filter.type = -1;
    client_filter.not_type = -1;
    client_filter.db_number = -1;
    client_filter.not_db_number = -1;
    client_filter.skipme = 1;

    if (c->argc == 3) {
        /* Old style syntax. The address points into argv and is not owned
         * by the filter. */
        client_filter.addr = c->argv[2]->ptr;
        client_filter.skipme = 0; /* With the old form, you can kill yourself. */
    } else if (c->argc > 3) {
        /* New style syntax: options start at index 2. On failure the parser
         * has already replied. */
        if (parseClientFiltersOrReply(c, 2, &client_filter) != C_OK) goto client_kill_done;
    } else {
        addReplyErrorObject(c, shared.syntaxerr);
        goto client_kill_done;
    }

    /* Walk all the clients and kill the matching ones. */
    listIter iter;
    listNode *node;
    listRewind(server.clients, &iter);
    while ((node = listNext(&iter)) != NULL) {
        client *target = listNodeValue(node);
        if (!clientMatchesFilter(target, &client_filter)) continue;

        if (target == c) {
            /* The caller is only flagged later, once its reply is queued. */
            close_this_client = 1;
        } else {
            freeClient(target);
        }
        killed++;
    }

    /* Reply according to old/new format. */
    if (c->argc == 3) {
        if (killed != 0) {
            addReply(c, shared.ok);
        } else {
            addReplyError(c, "No such client");
        }
    } else {
        addReplyLongLong(c, killed);
    }

    /* Flag the caller as close-after-reply only after the reply has been
     * queued to its output buffers. */
    if (close_this_client) c->flag.close_after_reply = 1;

client_kill_done:
    freeClientFilter(&client_filter);
}
|
|
|
|
/* Release every heap resource referenced by a clientFilter and reset the
 * corresponding fields to NULL, so that calling it again on the same filter
 * is harmless.
 *
 * The ID sets are released with zfree(), flags/capa/ip with sdsfree(), and
 * the library name/version objects with decrRefCount(). Fields that are not
 * handled here (for example addr) are left untouched. */
static void freeClientFilter(clientFilter *filter) {
    /* Client ID sets. */
    if (filter->ids != NULL) {
        zfree(filter->ids);
        filter->ids = NULL;
    }
    if (filter->not_ids != NULL) {
        zfree(filter->not_ids);
        filter->not_ids = NULL;
    }

    /* sds strings. */
    if (filter->flags != NULL) {
        sdsfree(filter->flags);
        filter->flags = NULL;
    }
    if (filter->not_flags != NULL) {
        sdsfree(filter->not_flags);
        filter->not_flags = NULL;
    }
    if (filter->capa != NULL) {
        sdsfree(filter->capa);
        filter->capa = NULL;
    }
    if (filter->not_capa != NULL) {
        sdsfree(filter->not_capa);
        filter->not_capa = NULL;
    }
    if (filter->ip != NULL) {
        sdsfree(filter->ip);
        filter->ip = NULL;
    }
    if (filter->not_ip != NULL) {
        sdsfree(filter->not_ip);
        filter->not_ip = NULL;
    }

    /* Reference counted objects. */
    if (filter->lib_name != NULL) {
        decrRefCount(filter->lib_name);
        filter->lib_name = NULL;
    }
    if (filter->not_lib_name != NULL) {
        decrRefCount(filter->not_lib_name);
        filter->not_lib_name = NULL;
    }
    if (filter->lib_ver != NULL) {
        decrRefCount(filter->lib_ver);
        filter->lib_ver = NULL;
    }
    if (filter->not_lib_ver != NULL) {
        decrRefCount(filter->not_lib_ver);
        filter->not_lib_ver = NULL;
    }
}
|
|
|
|
|
|
/* CLIENT UNBLOCK <id> [TIMEOUT|ERROR]
 *
 * Replies 1 if the target client was unblocked, 0 otherwise. The optional
 * reason is validated before the client ID is parsed. Without a reason the
 * client is unblocked as if it timed out. */
void clientUnblockCommand(client *c) {
    int with_error = 0;

    if (c->argc == 4) {
        const char *reason = c->argv[3]->ptr;
        if (strcasecmp(reason, "error") == 0) {
            with_error = 1;
        } else if (strcasecmp(reason, "timeout") != 0) {
            addReplyError(c, "CLIENT UNBLOCK reason should be TIMEOUT or ERROR");
            return;
        }
    }

    long long id;
    if (getLongLongFromObjectOrReply(c, c->argv[2], &id, NULL) != C_OK) return;

    struct client *target = lookupClientByID(id);

    /* Only clients whose blocking type may time out are unblocked here (even
     * for UNBLOCK ERROR): clients blocked on a module command, by CLIENT
     * PAUSE, or by any other blocking type without a timeout callback are
     * never touched. The assumption is that a command which does not expect
     * to be timed out does not expect to be unblocked by CLIENT UNBLOCK
     * either. */
    if (target == NULL || !target->flag.blocked || !blockedClientMayTimeout(target)) {
        addReply(c, shared.czero);
        return;
    }

    if (with_error) {
        unblockClientOnError(target, "-UNBLOCKED client unblocked via CLIENT UNBLOCK");
    } else {
        unblockClientOnTimeout(target);
    }
    addReply(c, shared.cone);
}
|
|
|
|
/* CLIENT SETNAME <name>
 *
 * Delegates to clientSetNameOrReply(); OK is sent only when that call
 * returns C_OK (on failure the helper is expected to have replied). */
void clientSetNameCommand(client *c) {
    robj *name = c->argv[2];

    if (clientSetNameOrReply(c, name) != C_OK) return;
    addReply(c, shared.ok);
}
|
|
|
|
/* CLIENT GETNAME: reply with the connection name as a bulk string, or with a
 * null reply when no name is set. */
void clientGetNameCommand(client *c) {
    if (c->name == NULL) {
        addReplyNull(c);
        return;
    }
    addReplyBulk(c, c->name);
}
|
|
|
|
/* CLIENT UNPAUSE: undo the pause whose purpose is PAUSE_BY_CLIENT_COMMAND
 * and reply OK. */
void clientUnpauseCommand(client *c) {
    unpauseActions(PAUSE_BY_CLIENT_COMMAND);

    addReply(c, shared.ok);
}
|
|
|
|
/* CLIENT PAUSE <timeout> [WRITE|ALL]
 *
 * The optional mode is validated first; it defaults to ALL. The timeout is
 * then parsed in milliseconds and passed, together with the mode, to
 * pauseClientsByClient(). */
void clientPauseCommand(client *c) {
    int pause_all = 1;

    if (c->argc == 4) {
        const char *mode = c->argv[3]->ptr;
        if (strcasecmp(mode, "write") == 0) {
            pause_all = 0;
        } else if (strcasecmp(mode, "all") != 0) {
            addReplyError(c, "CLIENT PAUSE mode must be WRITE or ALL");
            return;
        }
    }

    mstime_t end;
    if (getTimeoutFromObjectOrReply(c, c->argv[2], &end, UNIT_MILLISECONDS) != C_OK) return;

    pauseClientsByClient(end, pause_all);
    addReply(c, shared.ok);
}
|
|
|
|
/* CLIENT TRACKING (on|off) [REDIRECT <id>] [BCAST] [PREFIX first]
 *                 [PREFIX second] [OPTIN] [OPTOUT] [NOLOOP]...
 *
 * All the options are parsed first; only then is tracking enabled or
 * disabled according to argv[2]. The array of prefix pointers collected
 * while parsing is a temporary owned by this function and is released on
 * every exit path through the "cleanup" label. */
void clientTrackingCommand(client *c) {
    long long redir = 0;
    struct ClientFlags options = {0};
    robj **prefix = NULL;
    size_t numprefix = 0;

    initClientPubSubData(c);

    /* Parse the options. */
    for (int j = 3; j < c->argc; j++) {
        const char *opt = c->argv[j]->ptr;
        int has_value = j < c->argc - 1; /* At least one more argument follows. */

        if (has_value && !strcasecmp(opt, "redirect")) {
            j++;
            if (redir != 0) {
                addReplyError(c, "A client can only redirect to a single "
                                 "other client");
                goto cleanup;
            }

            if (getLongLongFromObjectOrReply(c, c->argv[j], &redir, NULL) != C_OK) goto cleanup;

            /* The client with the given ID must exist right now, even if it
             * may get disconnected later: still a valid sanity check. */
            if (lookupClientByID(redir) == NULL) {
                addReplyError(c, "The client ID you want redirect to "
                                 "does not exist");
                goto cleanup;
            }
        } else if (!strcasecmp(opt, "bcast")) {
            options.tracking_bcast = 1;
        } else if (!strcasecmp(opt, "optin")) {
            options.tracking_optin = 1;
        } else if (!strcasecmp(opt, "optout")) {
            options.tracking_optout = 1;
        } else if (!strcasecmp(opt, "noloop")) {
            options.tracking_noloop = 1;
        } else if (has_value && !strcasecmp(opt, "prefix")) {
            j++;
            prefix = zrealloc(prefix, sizeof(robj *) * (numprefix + 1));
            prefix[numprefix++] = c->argv[j];
        } else {
            addReplyErrorObject(c, shared.syntaxerr);
            goto cleanup;
        }
    }

    /* Options are ok: enable or disable the tracking for this client. */
    const char *state = c->argv[2]->ptr;
    if (!strcasecmp(state, "on")) {
        /* Before enabling tracking, make sure the options are compatible
         * with each other and with the current state of the client. */
        if (numprefix && !options.tracking_bcast) {
            addReplyError(c, "PREFIX option requires BCAST mode to be enabled");
            goto cleanup;
        }

        if (c->flag.tracking && !!c->flag.tracking_bcast != !!options.tracking_bcast) {
            addReplyError(c, "You can't switch BCAST mode on/off before disabling "
                             "tracking for this client, and then re-enabling it with "
                             "a different mode.");
            goto cleanup;
        }

        if (options.tracking_bcast && (options.tracking_optin || options.tracking_optout)) {
            addReplyError(c, "OPTIN and OPTOUT are not compatible with BCAST");
            goto cleanup;
        }

        if (options.tracking_optin && options.tracking_optout) {
            addReplyError(c, "You can't specify both OPTIN mode and OPTOUT mode");
            goto cleanup;
        }

        if ((options.tracking_optin && c->flag.tracking_optout) ||
            (options.tracking_optout && c->flag.tracking_optin)) {
            addReplyError(c, "You can't switch OPTIN/OPTOUT mode before disabling "
                             "tracking for this client, and then re-enabling it with "
                             "a different mode.");
            goto cleanup;
        }

        /* On a collision the checker has already replied. */
        if (options.tracking_bcast && !checkPrefixCollisionsOrReply(c, prefix, numprefix)) goto cleanup;

        enableTracking(c, redir, options, prefix, numprefix);
    } else if (!strcasecmp(state, "off")) {
        disableTracking(c);
    } else {
        addReplyErrorObject(c, shared.syntaxerr);
        goto cleanup;
    }

    addReply(c, shared.ok);

cleanup:
    zfree(prefix);
}
|
|
|
|
/* CLIENT CACHING YES|NO
 *
 * Requires tracking to be enabled. YES is accepted only in OPTIN mode and NO
 * only in OPTOUT mode; in both accepted cases the tracking_caching flag is
 * set and OK is returned. */
void clientCachingCommand(client *c) {
    if (!c->flag.tracking) {
        addReplyError(c, "CLIENT CACHING can be called only when the "
                         "client is in tracking mode with OPTIN or "
                         "OPTOUT mode enabled");
        return;
    }

    const char *opt = c->argv[2]->ptr;

    if (strcasecmp(opt, "yes") == 0) {
        if (!c->flag.tracking_optin) {
            addReplyError(c, "CLIENT CACHING YES is only valid when tracking is enabled in OPTIN mode.");
            return;
        }
    } else if (strcasecmp(opt, "no") == 0) {
        if (!c->flag.tracking_optout) {
            addReplyError(c, "CLIENT CACHING NO is only valid when tracking is enabled in OPTOUT mode.");
            return;
        }
    } else {
        addReplyErrorObject(c, shared.syntaxerr);
        return;
    }

    /* Both valid combinations end up here: raise the flag and acknowledge. */
    c->flag.tracking_caching = 1;
    addReply(c, shared.ok);
}
|
|
|
|
/* CLIENT GETREDIR: reply with the tracking redirection ID stored for this
 * client, or -1 when tracking is not enabled. */
void clientGetredirCommand(client *c) {
    if (!c->flag.tracking) {
        addReplyLongLong(c, -1);
        return;
    }
    addReplyLongLong(c, c->pubsub_data->client_tracking_redirection);
}
|
|
|
|
/* CLIENT TRACKINGINFO
 *
 * Replies with a three entry map:
 *   "flags"    -> set of flag names; its length is only known after the
 *                 flags have been emitted, hence the deferred length.
 *   "redirect" -> redirection client ID, or -1 when tracking is off.
 *   "prefixes" -> array of the registered prefixes (empty when tracking is
 *                 off or no prefix table exists). */
void clientTrackingInfoCommand(client *c) {
    addReplyMapLen(c, 3);

    /* Flags */
    addReplyBulkCString(c, "flags");
    void *flags_len = addReplyDeferredLen(c);

    /* The on/off state is always present, so the count starts at one. */
    int numflags = 1;
    if (c->flag.tracking) {
        addReplyBulkCString(c, "on");
    } else {
        addReplyBulkCString(c, "off");
    }

    if (c->flag.tracking_bcast) {
        addReplyBulkCString(c, "bcast");
        numflags += 1;
    }
    if (c->flag.tracking_optin) {
        addReplyBulkCString(c, "optin");
        numflags += 1;
        if (c->flag.tracking_caching) {
            addReplyBulkCString(c, "caching-yes");
            numflags += 1;
        }
    }
    if (c->flag.tracking_optout) {
        addReplyBulkCString(c, "optout");
        numflags += 1;
        if (c->flag.tracking_caching) {
            addReplyBulkCString(c, "caching-no");
            numflags += 1;
        }
    }
    if (c->flag.tracking_noloop) {
        addReplyBulkCString(c, "noloop");
        numflags += 1;
    }
    if (c->flag.tracking_broken_redir) {
        addReplyBulkCString(c, "broken_redirect");
        numflags += 1;
    }
    setDeferredSetLen(c, flags_len, numflags);

    /* Redirect */
    addReplyBulkCString(c, "redirect");
    if (!c->flag.tracking) {
        addReplyLongLong(c, -1);
    } else {
        addReplyLongLong(c, c->pubsub_data->client_tracking_redirection);
    }

    /* Prefixes */
    addReplyBulkCString(c, "prefixes");
    if (!c->flag.tracking || !c->pubsub_data->client_tracking_prefixes) {
        addReplyArrayLen(c, 0);
        return;
    }

    addReplyArrayLen(c, raxSize(c->pubsub_data->client_tracking_prefixes));
    raxIterator it;
    raxStart(&it, c->pubsub_data->client_tracking_prefixes);
    raxSeek(&it, "^", NULL, 0);
    while (raxNext(&it)) addReplyBulkCBuffer(c, it.key, it.key_len);
    raxStop(&it);
}
|
|
|
|
/* CLIENT NO-TOUCH ON|OFF
 *
 * Sets or clears the no_touch flag of the calling client and replies OK.
 * Any other argument is a syntax error. */
void clientNoTouchCommand(client *c) {
    const char *mode = c->argv[2]->ptr;
    int enable;

    if (strcasecmp(mode, "on") == 0) {
        enable = 1;
    } else if (strcasecmp(mode, "off") == 0) {
        enable = 0;
    } else {
        addReplyErrorObject(c, shared.syntaxerr);
        return;
    }

    c->flag.no_touch = enable;
    addReply(c, shared.ok);
}
|
|
|
|
/* CLIENT CAPA <option> [options...]
 *
 * Scans every argument after the subcommand: "redirect" (case insensitive)
 * sets CLIENT_CAPA_REDIRECT in c->capa, any other token is ignored. The
 * reply is always OK. */
void clientCapaCommand(client *c) {
    int i = 2;

    while (i < c->argc) {
        const char *capa = c->argv[i]->ptr;
        if (strcasecmp(capa, "redirect") == 0) c->capa |= CLIENT_CAPA_REDIRECT;
        i++;
    }

    addReply(c, shared.ok);
}
|
|
|
|
/* CLIENT IMPORT-SOURCE ON|OFF
 *
 * When the server is not in import mode only OFF is accepted; every other
 * argument (including an invalid one) gets the "not in import mode" error.
 * Otherwise ON/OFF set/clear the import_source flag and reply OK, and any
 * other argument is a syntax error. */
void clientImportSourceCommand(client *c) {
    const char *mode = c->argv[2]->ptr;
    int turn_off = (strcasecmp(mode, "off") == 0);

    if (!server.import_mode && !turn_off) {
        addReplyError(c, "Server is not in import mode");
        return;
    }

    if (strcasecmp(mode, "on") == 0) {
        c->flag.import_source = 1;
    } else if (turn_off) {
        c->flag.import_source = 0;
    } else {
        addReplyErrorObject(c, shared.syntaxerr);
        return;
    }

    addReply(c, shared.ok);
}
|
|
|
|
/* Handler of the CLIENT command itself. It performs no work of its own and
 * only replies with the subcommand syntax error. */
void clientCommand(client *c) {
    addReplySubcommandSyntaxError(c);
}
|
|
|
|
/* HELLO [<protocol-version> [AUTH <user> <password>] [SETNAME <name>] ] */
|
|
void helloCommand(client *c) {
|
|
long long ver = 0;
|
|
int next_arg = 1;
|
|
|
|
if (c->argc >= 2) {
|
|
if (getLongLongFromObjectOrReply(c, c->argv[next_arg++], &ver,
|
|
"Protocol version is not an integer or out of range") != C_OK) {
|
|
return;
|
|
}
|
|
|
|
if (ver < 2 || ver > 3) {
|
|
addReplyError(c, "-NOPROTO unsupported protocol version");
|
|
return;
|
|
}
|
|
}
|
|
|
|
robj *username = NULL;
|
|
robj *password = NULL;
|
|
robj *clientname = NULL;
|
|
for (int j = next_arg; j < c->argc; j++) {
|
|
int moreargs = (c->argc - 1) - j;
|
|
const char *opt = c->argv[j]->ptr;
|
|
if (!strcasecmp(opt, "AUTH") && moreargs >= 2) {
|
|
redactClientCommandArgument(c, j + 1);
|
|
redactClientCommandArgument(c, j + 2);
|
|
username = c->argv[j + 1];
|
|
password = c->argv[j + 2];
|
|
j += 2;
|
|
} else if (!strcasecmp(opt, "SETNAME") && moreargs) {
|
|
clientname = c->argv[j + 1];
|
|
const char *err = NULL;
|
|
if (validateClientName(clientname, &err) == C_ERR) {
|
|
addReplyError(c, err);
|
|
return;
|
|
}
|
|
j++;
|
|
} else {
|
|
addReplyErrorFormat(c, "Syntax error in HELLO option '%s'", opt);
|
|
return;
|
|
}
|
|
}
|
|
|
|
if (username && password) {
|
|
robj *err = NULL;
|
|
int auth_result = ACLAuthenticateUser(c, username, password, &err);
|
|
if (auth_result == AUTH_ERR) {
|
|
addAuthErrReply(c, err);
|
|
}
|
|
if (err) decrRefCount(err);
|
|
/* In case of auth errors, return early since we already replied with an ERR.
|
|
* In case of blocking module auth, we reply to the client/setname later upon unblocking. */
|
|
if (auth_result == AUTH_ERR || auth_result == AUTH_BLOCKED) {
|
|
return;
|
|
}
|
|
}
|
|
|
|
/* At this point we need to be authenticated to continue. */
|
|
if (!c->flag.authenticated) {
|
|
addReplyError(c, "-NOAUTH HELLO must be called with the client already "
|
|
"authenticated, otherwise the HELLO <proto> AUTH <user> <pass> "
|
|
"option can be used to authenticate the client and "
|
|
"select the RESP protocol version at the same time");
|
|
return;
|
|
}
|
|
|
|
/* Now that we're authenticated, set the client name. */
|
|
if (clientname) clientSetName(c, clientname, NULL);
|
|
|
|
/* Let's switch to the specified RESP mode. */
|
|
if (ver) c->resp = ver;
|
|
addReplyMapLen(c, 6 + !server.sentinel_mode + (sdslen(server.availability_zone) != 0));
|
|
|
|
addReplyBulkCString(c, "server");
|
|
addReplyBulkCString(c, server.extended_redis_compat ? "redis" : SERVER_NAME);
|
|
|
|
addReplyBulkCString(c, "version");
|
|
addReplyBulkCString(c, server.extended_redis_compat ? REDIS_VERSION : VALKEY_VERSION);
|
|
|
|
addReplyBulkCString(c, "proto");
|
|
addReplyLongLong(c, c->resp);
|
|
|
|
addReplyBulkCString(c, "id");
|
|
addReplyLongLong(c, c->id);
|
|
|
|
addReplyBulkCString(c, "mode");
|
|
if (server.sentinel_mode)
|
|
addReplyBulkCString(c, "sentinel");
|
|
else if (server.cluster_enabled)
|
|
addReplyBulkCString(c, "cluster");
|
|
else
|
|
addReplyBulkCString(c, "standalone");
|
|
|
|
if (!server.sentinel_mode) {
|
|
addReplyBulkCString(c, "role");
|
|
addReplyBulkCString(c, server.primary_host ? "replica" : "master");
|
|
}
|
|
|
|
addReplyBulkCString(c, "modules");
|
|
addReplyLoadedModules(c);
|
|
|
|
if (sdslen(server.availability_zone) != 0) {
|
|
addReplyBulkCString(c, "availability_zone");
|
|
addReplyBulkCBuffer(c, server.availability_zone, sdslen(server.availability_zone));
|
|
}
|
|
}
|
|
|
|
/* This callback is bound to POST and "Host:" command names. Those are not
|
|
* really commands, but are used in security attacks in order to talk to
|
|
* instances via HTTP, with a technique called "cross protocol scripting"
|
|
* which exploits the fact that services like this server will discard invalid
|
|
* HTTP headers and will process what follows.
|
|
*
|
|
* As a protection against this attack, the server will terminate the connection
|
|
* when a POST or "Host:" header is seen, and will log the event from
|
|
* time to time (to avoid creating a DOS as a result of too many logs). */
|
|
void securityWarningCommand(client *c) {
|
|
static time_t logged_time = 0;
|
|
time_t now = time(NULL);
|
|
|
|
if (llabs(now - logged_time) > 60) {
|
|
char ip[NET_IP_STR_LEN];
|
|
int port;
|
|
if (connAddrPeerName(c->conn, ip, sizeof(ip), &port) == -1) {
|
|
serverLog(LL_WARNING,
|
|
"Possible SECURITY ATTACK detected. It looks like somebody is sending POST or Host: "
|
|
"commands to %s. This is likely due to an attacker attempting to use Cross "
|
|
"Protocol Scripting to compromise your %s instance. Connection aborted.",
|
|
SERVER_TITLE, SERVER_TITLE);
|
|
} else {
|
|
serverLog(LL_WARNING,
|
|
"Possible SECURITY ATTACK detected. It looks like somebody is sending POST or Host: commands to "
|
|
"%s. This is likely due to an attacker attempting to use Cross Protocol Scripting to "
|
|
"compromise your %s instance. Connection from %s:%d aborted.",
|
|
SERVER_TITLE, SERVER_TITLE, ip, port);
|
|
}
|
|
logged_time = now;
|
|
}
|
|
freeClientAsync(c);
|
|
}
|
|
|
|
/* This function preserves the original command arguments for accurate commandlog recording.
|
|
*
|
|
* It performs the following operations:
|
|
* - Stores the initial command vector if not already saved
|
|
* - Manages memory allocation for command argument modifications
|
|
*
|
|
* new_argc - The new number of arguments to allocate space for if necessary.
|
|
* new_argv - Optional pointer to a new argument vector. If NULL, space will be
|
|
* allocated for new_argc arguments, preserving the existing arguments.
|
|
*/
|
|
static void backupAndUpdateClientArgv(client *c, int new_argc, robj **new_argv) {
|
|
robj **old_argv = c->argv;
|
|
int old_argc = c->argc;
|
|
|
|
/* Store original arguments if not already saved */
|
|
if (!c->original_argv) {
|
|
c->original_argc = old_argc;
|
|
c->original_argv = old_argv;
|
|
}
|
|
|
|
/* Handle direct argv replacement */
|
|
if (new_argv) {
|
|
c->argv = new_argv;
|
|
} else if (c->original_argv == old_argv || new_argc > old_argc) {
|
|
/* Allocate new array if necessary */
|
|
c->argv = zmalloc(sizeof(robj *) * new_argc);
|
|
|
|
for (int i = 0; i < old_argc && i < new_argc; i++) {
|
|
c->argv[i] = old_argv[i];
|
|
incrRefCount(c->argv[i]);
|
|
}
|
|
|
|
/* Initialize new argument slots to NULL */
|
|
for (int i = old_argc; i < new_argc; i++) {
|
|
c->argv[i] = NULL;
|
|
}
|
|
}
|
|
|
|
c->argc = new_argc;
|
|
c->argv_len = new_argc;
|
|
|
|
/* Clean up old argv if necessary */
|
|
if (c->argv != old_argv && c->original_argv != old_argv) {
|
|
for (int i = 0; i < old_argc; i++) {
|
|
if (old_argv[i]) decrRefCount(old_argv[i]);
|
|
}
|
|
zfree(old_argv);
|
|
}
|
|
}
|
|
|
|
/* Redact a given argument to prevent it from being shown
|
|
* in the commandlog. This information is stored in the
|
|
* original_argv array. */
|
|
void redactClientCommandArgument(client *c, int argc) {
|
|
backupAndUpdateClientArgv(c, c->argc, NULL);
|
|
if (c->original_argv[argc] == shared.redacted) {
|
|
/* This argument has already been redacted */
|
|
return;
|
|
}
|
|
decrRefCount(c->original_argv[argc]);
|
|
c->original_argv[argc] = shared.redacted;
|
|
}
|
|
|
|
/* Rewrite the command vector of the client. All the new objects ref count
|
|
* is incremented. The old command vector is freed, and the old objects
|
|
* ref count is decremented. */
|
|
void rewriteClientCommandVector(client *c, int argc, ...) {
|
|
va_list ap;
|
|
int j;
|
|
robj **argv; /* The new argument vector */
|
|
|
|
argv = zmalloc(sizeof(robj *) * argc);
|
|
va_start(ap, argc);
|
|
for (j = 0; j < argc; j++) {
|
|
robj *a;
|
|
|
|
a = va_arg(ap, robj *);
|
|
argv[j] = a;
|
|
incrRefCount(a);
|
|
}
|
|
replaceClientCommandVector(c, argc, argv);
|
|
va_end(ap);
|
|
}
|
|
|
|
/* Completely replace the client command vector with the provided one. */
|
|
void replaceClientCommandVector(client *c, int argc, robj **argv) {
|
|
backupAndUpdateClientArgv(c, argc, argv);
|
|
c->argv_len_sum = 0;
|
|
c->flag.buffered_reply = 0;
|
|
for (int j = 0; j < c->argc; j++)
|
|
if (c->argv[j]) c->argv_len_sum += getStringObjectLen(c->argv[j]);
|
|
c->cmd = lookupCommandOrOriginal(c->argv, c->argc);
|
|
serverAssertWithInfo(c, NULL, c->cmd != NULL);
|
|
}
|
|
|
|
/* Rewrite a single item in the command vector.
|
|
* The new val ref count is incremented, and the old decremented.
|
|
*
|
|
* It is possible to specify an argument over the current size of the
|
|
* argument vector: in this case the array of objects gets reallocated
|
|
* and c->argc set to the max value. However it's up to the caller to
|
|
*
|
|
* 1. Make sure there are no "holes" and all the arguments are set.
|
|
* 2. If the original argument vector was longer than the one we
|
|
* want to end with, it's up to the caller to set c->argc and
|
|
* free the no longer used objects on c->argv. */
|
|
void rewriteClientCommandArgument(client *c, int i, robj *newval) {
|
|
robj *oldval;
|
|
int new_argc = (i >= c->argc) ? i + 1 : c->argc;
|
|
backupAndUpdateClientArgv(c, new_argc, NULL);
|
|
|
|
oldval = c->argv[i];
|
|
if (oldval) c->argv_len_sum -= getStringObjectLen(oldval);
|
|
if (newval) c->argv_len_sum += getStringObjectLen(newval);
|
|
c->argv[i] = newval;
|
|
incrRefCount(newval);
|
|
if (oldval) decrRefCount(oldval);
|
|
|
|
/* If this is the command name make sure to fix c->cmd. */
|
|
if (i == 0) {
|
|
c->flag.buffered_reply = 0;
|
|
c->cmd = lookupCommandOrOriginal(c->argv, c->argc);
|
|
serverAssertWithInfo(c, NULL, c->cmd != NULL);
|
|
}
|
|
}
|
|
|
|
/* This function returns the number of bytes that the server is
|
|
* using to store the reply still not read by the client.
|
|
*
|
|
* Note: this function is very fast so can be called as many time as
|
|
* the caller wishes. The main usage of this function currently is
|
|
* enforcing the client output length limits. */
|
|
size_t getClientOutputBufferMemoryUsage(client *c) {
|
|
if (getClientType(c) == CLIENT_TYPE_REPLICA) {
|
|
size_t repl_buf_size = 0;
|
|
size_t repl_node_num = 0;
|
|
size_t repl_node_size = sizeof(listNode) + sizeof(replBufBlock);
|
|
if (c->repl_data->ref_repl_buf_node) {
|
|
replBufBlock *last = listNodeValue(listLast(server.repl_buffer_blocks));
|
|
replBufBlock *cur = listNodeValue(c->repl_data->ref_repl_buf_node);
|
|
repl_buf_size = last->repl_offset + last->size - cur->repl_offset;
|
|
repl_node_num = last->id - cur->id + 1;
|
|
}
|
|
return repl_buf_size + (repl_node_size * repl_node_num);
|
|
}
|
|
|
|
size_t list_item_size = sizeof(listNode) + sizeof(clientReplyBlock);
|
|
size_t usage = c->reply_bytes + (list_item_size * listLength(c->reply));
|
|
if (isDeferredReplyEnabled(c)) {
|
|
usage += c->deferred_reply_bytes +
|
|
(list_item_size * listLength(c->deferred_reply));
|
|
}
|
|
return usage;
|
|
}
|
|
|
|
/* Returns the total client's memory usage.
|
|
* Optionally, if output_buffer_mem_usage is not NULL, it fills it with
|
|
* the client output buffer memory usage portion of the total. */
|
|
size_t getClientMemoryUsage(client *c, size_t *output_buffer_mem_usage) {
|
|
size_t mem = getClientOutputBufferMemoryUsage(c);
|
|
|
|
if (output_buffer_mem_usage != NULL) *output_buffer_mem_usage = mem;
|
|
mem += c->querybuf ? sdsAllocSize(c->querybuf) : 0;
|
|
mem += zmalloc_size(c);
|
|
mem += c->buf_usable_size;
|
|
/* For efficiency (less work keeping track of the argv memory), it doesn't include the used memory
|
|
* i.e. unused sds space and internal fragmentation, just the string length. but this is enough to
|
|
* spot problematic clients. */
|
|
mem += c->argv_len_sum + sizeof(robj *) * c->argc;
|
|
mem += multiStateMemOverhead(c);
|
|
|
|
/* Add memory overhead of pubsub channels and patterns. Note: this is just the overhead of the robj pointers
|
|
* to the strings themselves because they aren't stored per client. */
|
|
mem += pubsubMemOverhead(c);
|
|
|
|
/* Add memory overhead of the tracking prefixes, this is an underestimation so we don't need to traverse the entire
|
|
* rax */
|
|
if (c->pubsub_data && c->pubsub_data->client_tracking_prefixes)
|
|
mem += c->pubsub_data->client_tracking_prefixes->numnodes * (sizeof(raxNode) + sizeof(raxNode *));
|
|
|
|
return mem;
|
|
}
|
|
|
|
/* Get the class of a client, used in order to enforce limits to different
|
|
* classes of clients.
|
|
*
|
|
* The function will return one of the following:
|
|
* CLIENT_TYPE_NORMAL -> Normal client, including MONITOR
|
|
* CLIENT_TYPE_REPLICA -> replica
|
|
* CLIENT_TYPE_PUBSUB -> Client subscribed to Pub/Sub channels
|
|
* CLIENT_TYPE_PRIMARY -> The client representing our replication primary.
|
|
*/
|
|
int getClientType(client *c) {
|
|
if (c->flag.primary) return CLIENT_TYPE_PRIMARY;
|
|
/* Even though MONITOR clients are marked as replicas, we
|
|
* want the expose them as normal clients. */
|
|
if (c->flag.replica && !c->flag.monitor) return CLIENT_TYPE_REPLICA;
|
|
if (c->flag.pubsub) return CLIENT_TYPE_PUBSUB;
|
|
if (c->slot_migration_job) return isImportSlotMigrationJob(c->slot_migration_job) ? CLIENT_TYPE_SLOT_IMPORT : CLIENT_TYPE_SLOT_EXPORT;
|
|
return CLIENT_TYPE_NORMAL;
|
|
}
|
|
|
|
int getClientTypeByName(char *name) {
|
|
if (!strcasecmp(name, "normal"))
|
|
return CLIENT_TYPE_NORMAL;
|
|
else if (!strcasecmp(name, "slave"))
|
|
return CLIENT_TYPE_REPLICA;
|
|
else if (!strcasecmp(name, "replica"))
|
|
return CLIENT_TYPE_REPLICA;
|
|
else if (!strcasecmp(name, "pubsub"))
|
|
return CLIENT_TYPE_PUBSUB;
|
|
else if (!strcasecmp(name, "master") || !strcasecmp(name, "primary"))
|
|
return CLIENT_TYPE_PRIMARY;
|
|
else
|
|
return -1;
|
|
}
|
|
|
|
char *getClientTypeName(int class) {
|
|
switch (class) {
|
|
case CLIENT_TYPE_NORMAL: return "normal";
|
|
case CLIENT_TYPE_REPLICA: return "slave";
|
|
case CLIENT_TYPE_PUBSUB: return "pubsub";
|
|
case CLIENT_TYPE_PRIMARY: return "master";
|
|
case CLIENT_TYPE_SLOT_IMPORT: return "slot-import";
|
|
case CLIENT_TYPE_SLOT_EXPORT: return "slot-export";
|
|
default: return NULL;
|
|
}
|
|
}
|
|
|
|
/* The function checks if the client reached output buffer soft or hard
|
|
* limit, and also update the state needed to check the soft limit as
|
|
* a side effect.
|
|
*
|
|
* Return value: non-zero if the client reached the soft or the hard limit.
|
|
* Otherwise zero is returned. */
|
|
int checkClientOutputBufferLimits(client *c) {
|
|
int soft = 0, hard = 0, class;
|
|
unsigned long used_mem = getClientOutputBufferMemoryUsage(c);
|
|
|
|
/* For unauthenticated clients which were also never authenticated before the output buffer is limited to prevent
|
|
* them from abusing it by not reading the replies */
|
|
if (used_mem > REPLY_BUFFER_SIZE_UNAUTHENTICATED_CLIENT && authRequired(c) && !clientEverAuthenticated(c))
|
|
return 1;
|
|
|
|
class = getClientType(c);
|
|
/* For the purpose of output buffer limiting, primaries are handled
|
|
* like normal clients. */
|
|
if (class == CLIENT_TYPE_PRIMARY) class = CLIENT_TYPE_NORMAL;
|
|
|
|
/* Slot import clients are treated as normal as well */
|
|
if (class == CLIENT_TYPE_SLOT_IMPORT) class = CLIENT_TYPE_NORMAL;
|
|
|
|
/* Slot export clients are treated as replicas */
|
|
if (class == CLIENT_TYPE_SLOT_EXPORT) class = CLIENT_TYPE_REPLICA;
|
|
|
|
/* Note that it doesn't make sense to set the replica clients output buffer
|
|
* limit lower than the repl-backlog-size config (partial sync will succeed
|
|
* and then replica will get disconnected).
|
|
* Such a configuration is ignored (the size of repl-backlog-size will be used).
|
|
* This doesn't have memory consumption implications since the replica client
|
|
* will share the backlog buffers memory. */
|
|
size_t hard_limit_bytes = server.client_obuf_limits[class].hard_limit_bytes;
|
|
size_t soft_limit_bytes = server.client_obuf_limits[class].soft_limit_bytes;
|
|
if (class == CLIENT_TYPE_REPLICA && hard_limit_bytes && (long long)hard_limit_bytes < server.repl_backlog_size)
|
|
hard_limit_bytes = server.repl_backlog_size;
|
|
if (class == CLIENT_TYPE_REPLICA && soft_limit_bytes && (long long)soft_limit_bytes < server.repl_backlog_size)
|
|
soft_limit_bytes = server.repl_backlog_size;
|
|
if (server.client_obuf_limits[class].hard_limit_bytes && used_mem >= hard_limit_bytes) hard = 1;
|
|
if (server.client_obuf_limits[class].soft_limit_bytes && used_mem >= soft_limit_bytes) soft = 1;
|
|
|
|
/* We need to check if the soft limit is reached continuously for the
|
|
* specified amount of seconds. */
|
|
if (soft) {
|
|
if (c->obuf_soft_limit_reached_time == 0) {
|
|
c->obuf_soft_limit_reached_time = server.unixtime;
|
|
soft = 0; /* First time we see the soft limit reached */
|
|
} else {
|
|
time_t elapsed = server.unixtime - c->obuf_soft_limit_reached_time;
|
|
|
|
if (elapsed <= server.client_obuf_limits[class].soft_limit_seconds) {
|
|
soft = 0; /* The client still did not reached the max number of
|
|
seconds for the soft limit to be considered
|
|
reached. */
|
|
}
|
|
}
|
|
} else {
|
|
c->obuf_soft_limit_reached_time = 0;
|
|
}
|
|
return soft || hard;
|
|
}
|
|
|
|
/* Asynchronously close a client if soft or hard limit is reached on the
|
|
* output buffer size. The caller can check if the client will be closed
|
|
* checking if the client CLIENT_CLOSE_ASAP flag is set.
|
|
*
|
|
* Note: we need to close the client asynchronously because this function is
|
|
* called from contexts where the client can't be freed safely, i.e. from the
|
|
* lower level functions pushing data inside the client output buffers.
|
|
* When `async` is set to 0, we close the client immediately, this is
|
|
* useful when called from cron.
|
|
*
|
|
* Returns 1 if client was (flagged) closed. */
|
|
int closeClientOnOutputBufferLimitReached(client *c, int async) {
|
|
if (c->flag.fake) return 0; /* It is unsafe to free fake clients. */
|
|
serverAssert(c->conn);
|
|
serverAssert(c->reply_bytes < SIZE_MAX - (1024 * 64));
|
|
/* Note that c->reply_bytes is irrelevant for replica clients
|
|
* (they use the global repl buffers). */
|
|
if ((c->reply_bytes == 0 && getClientType(c) != CLIENT_TYPE_REPLICA) ||
|
|
(c->flag.close_asap && !(c->flag.protected_rdb_channel)))
|
|
return 0;
|
|
if (checkClientOutputBufferLimits(c)) {
|
|
sds client = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log);
|
|
/* Remove RDB connection protection on COB overrun */
|
|
|
|
if (async || c->flag.protected_rdb_channel) {
|
|
c->flag.protected_rdb_channel = 0;
|
|
freeClientAsync(c);
|
|
serverLog(LL_WARNING, "Client %s scheduled to be closed ASAP for overcoming of output buffer limits.",
|
|
client);
|
|
} else {
|
|
freeClient(c);
|
|
serverLog(LL_WARNING, "Client %s closed for overcoming of output buffer limits.", client);
|
|
}
|
|
sdsfree(client);
|
|
server.stat_client_outbuf_limit_disconnections++;
|
|
return 1;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/* Helper function used by performEvictions() in order to flush replicas
|
|
* output buffers without returning control to the event loop.
|
|
* This is also called by SHUTDOWN for a best-effort attempt to send
|
|
* replicas the latest writes. */
|
|
void flushReplicasOutputBuffers(void) {
|
|
listIter li;
|
|
listNode *ln;
|
|
|
|
listRewind(server.replicas, &li);
|
|
while ((ln = listNext(&li))) {
|
|
client *replica = listNodeValue(ln);
|
|
int can_receive_writes = connHasWriteHandler(replica->conn) || (replica->flag.pending_write);
|
|
|
|
/* We don't want to send the pending data to the replica in a few
|
|
* cases:
|
|
*
|
|
* 1. For some reason there is neither the write handler installed
|
|
* nor the client is flagged as to have pending writes: for some
|
|
* reason this replica may not be set to receive data. This is
|
|
* just for the sake of defensive programming.
|
|
*
|
|
* 2. The put_online_on_ack flag is true. To know why we don't want
|
|
* to send data to the replica in this case, please grep for the
|
|
* flag for this flag.
|
|
*
|
|
* 3. Obviously if the replica is not ONLINE.
|
|
*/
|
|
if (isReplicaReadyForReplData(replica) && !(replica->flag.close_asap) && can_receive_writes &&
|
|
!replica->repl_data->repl_start_cmd_stream_on_ack && clientHasPendingReplies(replica)) {
|
|
writeToClient(replica);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Return a static string describing the given pause purpose.
 *
 * NUM_PAUSE_PURPOSES is reported as "none"; any value that is not handled
 * explicitly is reported as "Unknown pause reason". */
char *getPausedReason(pause_purpose purpose) {
    char *reason;

    switch (purpose) {
    case PAUSE_BY_CLIENT_COMMAND: reason = "client_pause"; break;
    case PAUSE_DURING_SHUTDOWN: reason = "shutdown_in_progress"; break;
    case PAUSE_DURING_FAILOVER: reason = "failover_in_progress"; break;
    case PAUSE_DURING_SLOT_MIGRATION: reason = "slot_migration_in_progress"; break;
    case NUM_PAUSE_PURPOSES: reason = "none"; break;
    default: reason = "Unknown pause reason"; break;
    }

    return reason;
}
|
|
|
|
/* Find the longest remaining pause, in milliseconds, among all the pause
 * purposes that currently pause at least one of the actions in 'action'.
 *
 * '*purpose' is set to the purpose owning that pause, or to
 * NUM_PAUSE_PURPOSES when no purpose has a positive remaining time for the
 * requested actions (in which case 0 is returned). */
mstime_t getPausedActionTimeout(uint32_t action, pause_purpose *purpose) {
    mstime_t longest = 0;

    *purpose = NUM_PAUSE_PURPOSES;
    for (int idx = 0; idx < NUM_PAUSE_PURPOSES; idx++) {
        const pause_event *ev = &server.client_pause_per_purpose[idx];

        if (!(ev->paused_actions & action)) continue;
        if ((ev->end - server.mstime) > longest) {
            longest = ev->end - server.mstime;
            *purpose = idx;
        }
    }

    return longest;
}
|
|
|
|
/* Compute current paused actions and its end time, aggregated for
|
|
* all pause purposes. */
|
|
void updatePausedActions(void) {
|
|
uint32_t prev_paused_actions = server.paused_actions;
|
|
server.paused_actions = 0;
|
|
|
|
for (int i = 0; i < NUM_PAUSE_PURPOSES; i++) {
|
|
pause_event *p = &(server.client_pause_per_purpose[i]);
|
|
if (p->end > server.mstime)
|
|
server.paused_actions |= p->paused_actions;
|
|
else {
|
|
p->paused_actions = 0;
|
|
p->end = 0;
|
|
}
|
|
}
|
|
|
|
/* If the pause type is less restrictive than before, we unblock all clients
|
|
* so they are reprocessed (may get re-paused). */
|
|
uint32_t mask_cli = (PAUSE_ACTION_CLIENT_WRITE | PAUSE_ACTION_CLIENT_ALL);
|
|
if ((server.paused_actions & mask_cli) < (prev_paused_actions & mask_cli)) {
|
|
unblockPostponedClients();
|
|
}
|
|
}
|
|
|
|
/* Unblock all paused clients (ones that where blocked by BLOCKED_POSTPONE (possibly in processCommand).
|
|
* This means they'll get re-processed in beforeSleep, and may get paused again if needed. */
|
|
void unblockPostponedClients(void) {
|
|
listNode *ln;
|
|
listIter li;
|
|
listRewind(server.postponed_clients, &li);
|
|
while ((ln = listNext(&li)) != NULL) {
|
|
client *c = listNodeValue(ln);
|
|
unblockClient(c, 1);
|
|
}
|
|
}
|
|
|
|
/* Set pause-client end-time and restricted action. If already paused, then:
|
|
* 1. Keep higher end-time value between configured and the new one
|
|
* 2. Keep most restrictive action between configured and the new one */
|
|
static void pauseClientsByClient(mstime_t endTime, int isPauseClientAll) {
|
|
uint32_t actions;
|
|
pause_event *p = &server.client_pause_per_purpose[PAUSE_BY_CLIENT_COMMAND];
|
|
|
|
if (isPauseClientAll)
|
|
actions = PAUSE_ACTIONS_CLIENT_ALL_SET;
|
|
else {
|
|
actions = PAUSE_ACTIONS_CLIENT_WRITE_SET;
|
|
/* If currently configured most restrictive client pause, then keep it */
|
|
if (p->paused_actions & PAUSE_ACTION_CLIENT_ALL) actions = PAUSE_ACTIONS_CLIENT_ALL_SET;
|
|
}
|
|
|
|
pauseActions(PAUSE_BY_CLIENT_COMMAND, endTime, actions);
|
|
}
|
|
|
|
/* Pause actions up to the specified unixtime (in ms) for a given type of
|
|
* purpose.
|
|
*
|
|
* A main use case of this function is to allow pausing replication traffic
|
|
* so that a failover without data loss to occur. Replicas will continue to receive
|
|
* traffic to facilitate this functionality.
|
|
*
|
|
* This function is also internally used by Cluster for the manual
|
|
* failover procedure implemented by CLUSTER FAILOVER.
|
|
*
|
|
* The function always succeed, even if there is already a pause in progress.
|
|
* The new paused_actions of a given 'purpose' will override the old ones and
|
|
* end time will be updated if new end time is bigger than currently configured */
|
|
void pauseActions(pause_purpose purpose, mstime_t end, uint32_t actions) {
|
|
/* Manage pause type and end time per pause purpose. */
|
|
server.client_pause_per_purpose[purpose].paused_actions = actions;
|
|
|
|
/* If currently configured end time bigger than new one, then keep it */
|
|
if (server.client_pause_per_purpose[purpose].end < end) server.client_pause_per_purpose[purpose].end = end;
|
|
|
|
updatePausedActions();
|
|
|
|
/* We allow write commands that were queued
|
|
* up before and after to execute. We need
|
|
* to track this state so that we don't assert
|
|
* in propagateNow(). */
|
|
if (server.in_exec) {
|
|
server.client_pause_in_transaction = 1;
|
|
}
|
|
}
|
|
|
|
/* Unpause actions and queue them for reprocessing. */
|
|
void unpauseActions(pause_purpose purpose) {
|
|
server.client_pause_per_purpose[purpose].end = 0;
|
|
server.client_pause_per_purpose[purpose].paused_actions = 0;
|
|
updatePausedActions();
|
|
}
|
|
|
|
/* Returns bitmask of paused actions */
|
|
uint32_t isPausedActions(uint32_t actions_bitmask) {
|
|
return (server.paused_actions & actions_bitmask);
|
|
}
|
|
|
|
/* Returns bitmask of paused actions */
|
|
uint32_t isPausedActionsWithUpdate(uint32_t actions_bitmask) {
|
|
if (!(server.paused_actions & actions_bitmask)) return 0;
|
|
updatePausedActions();
|
|
return (server.paused_actions & actions_bitmask);
|
|
}
|
|
|
|
/* Return the bitmask of actions currently recorded as paused for the given
 * pause purpose. */
uint32_t getPausedActionsWithPurpose(pause_purpose purpose) {
    const pause_event *ev = &server.client_pause_per_purpose[purpose];
    return ev->paused_actions;
}
|
|
|
|
/* This function is called by the server in order to process a few events from
|
|
* time to time while blocked into some not interruptible operation.
|
|
* This allows to reply to clients with the -LOADING error while loading the
|
|
* data set at startup or after a full resynchronization with the primary
|
|
* and so forth.
|
|
*
|
|
* It calls the event loop in order to process a few events. Specifically we
|
|
* try to call the event loop 4 times as long as we receive acknowledge that
|
|
* some event was processed, in order to go forward with the accept, read,
|
|
* write, close sequence needed to serve a client.
|
|
*
|
|
* The function returns the total number of events processed. */
|
|
void processEventsWhileBlocked(void) {
|
|
int iterations = 4; /* See the function top-comment. */
|
|
|
|
/* Update our cached time since it is used to create and update the last
|
|
* interaction time with clients and for other important things. */
|
|
updateCachedTime(0);
|
|
|
|
/* For the few commands that are allowed during busy scripts, we rather
|
|
* provide a fresher time than the one from when the script started (they
|
|
* still won't get it from the call due to execution_nesting. For commands
|
|
* during loading this doesn't matter. */
|
|
mstime_t prev_cmd_time_snapshot = server.cmd_time_snapshot;
|
|
server.cmd_time_snapshot = server.mstime;
|
|
|
|
/* Note: when we are processing events while blocked (for instance during
|
|
* busy Lua scripts), we set a global flag. When such flag is set, we
|
|
* avoid handling the read part of clients using threaded I/O.
|
|
* See https://github.com/redis/redis/issues/6988 for more info.
|
|
* Note that there could be cases of nested calls to this function,
|
|
* specifically on a busy script during async_loading rdb, and scripts
|
|
* that came from AOF. */
|
|
ProcessingEventsWhileBlocked++;
|
|
while (iterations--) {
|
|
long long startval = server.events_processed_while_blocked;
|
|
long long ae_events =
|
|
aeProcessEvents(server.el, AE_FILE_EVENTS | AE_DONT_WAIT | AE_CALL_BEFORE_SLEEP | AE_CALL_AFTER_SLEEP);
|
|
/* Note that server.events_processed_while_blocked will also get
|
|
* incremented by callbacks called by the event loop handlers. */
|
|
server.events_processed_while_blocked += ae_events;
|
|
long long events = server.events_processed_while_blocked - startval;
|
|
if (!events) break;
|
|
}
|
|
|
|
whileBlockedCron();
|
|
|
|
ProcessingEventsWhileBlocked--;
|
|
serverAssert(ProcessingEventsWhileBlocked >= 0);
|
|
|
|
server.cmd_time_snapshot = prev_cmd_time_snapshot;
|
|
}
|
|
|
|
/* Return 1 if the client read is handled using threaded I/O.
|
|
* 0 otherwise. */
|
|
int postponeClientRead(client *c) {
|
|
if (ProcessingEventsWhileBlocked) return 0;
|
|
|
|
return (trySendReadToIOThreads(c) == C_OK);
|
|
}
|
|
|
|
int processIOThreadsReadDone(void) {
|
|
if (ProcessingEventsWhileBlocked) {
|
|
/* When ProcessingEventsWhileBlocked we may call processIOThreadsReadDone recursively.
|
|
* In this case, there may be some clients left in the batch waiting to be processed. */
|
|
processClientsCommandsBatch();
|
|
}
|
|
|
|
if (listLength(server.clients_pending_io_read) == 0) return 0;
|
|
int processed = 0;
|
|
listNode *ln;
|
|
|
|
listNode *next = listFirst(server.clients_pending_io_read);
|
|
while (next) {
|
|
ln = next;
|
|
next = listNextNode(ln);
|
|
client *c = listNodeValue(ln);
|
|
|
|
/* Client is still waiting for a pending I/O - skip it */
|
|
if (c->io_write_state == CLIENT_PENDING_IO || c->io_read_state == CLIENT_PENDING_IO) continue;
|
|
/* If the write job is done, process it ASAP to free the buffer and handle connection errors */
|
|
if (c->io_write_state == CLIENT_COMPLETED_IO) {
|
|
int allow_async_writes = 0; /* Don't send writes for the client to IO threads before processing the reads */
|
|
processClientIOWriteDone(c, allow_async_writes);
|
|
}
|
|
/* memory barrier acquire to get the updated client state */
|
|
atomic_thread_fence(memory_order_acquire);
|
|
|
|
listUnlinkNode(server.clients_pending_io_read, ln);
|
|
c->flag.pending_read = 0;
|
|
c->io_read_state = CLIENT_IDLE;
|
|
|
|
/* Don't post-process-reads from clients that are going to be closed anyway. */
|
|
if (c->flag.close_asap) continue;
|
|
|
|
/* If a client is protected, don't do anything,
|
|
* that may trigger read/write error or recreate handler. */
|
|
if (c->flag.protected) continue;
|
|
|
|
processed++;
|
|
server.stat_io_reads_processed++;
|
|
|
|
/* Save the current conn state, as connUpdateState may modify it */
|
|
int in_accept_state = (connGetState(c->conn) == CONN_STATE_ACCEPTING);
|
|
connSetPostponeUpdateState(c->conn, 0);
|
|
connUpdateState(c->conn);
|
|
|
|
/* In accept state, no client's data was read - stop here. */
|
|
if (in_accept_state) continue;
|
|
|
|
/* On read error - stop here. */
|
|
if (handleReadResult(c) == C_ERR) {
|
|
continue;
|
|
}
|
|
|
|
if (!(c->read_flags & READ_FLAGS_DONT_PARSE)) {
|
|
parseResult res = handleParseResults(c);
|
|
/* On parse error - stop here. */
|
|
if (res == PARSE_ERR) {
|
|
continue;
|
|
} else if (res == PARSE_NEEDMORE) {
|
|
beforeNextClient(c);
|
|
continue;
|
|
}
|
|
}
|
|
|
|
if (c->argc > 0) {
|
|
c->flag.pending_command = 1;
|
|
}
|
|
|
|
size_t list_length_before_command_execute = listLength(server.clients_pending_io_read);
|
|
/* try to add the command to the batch */
|
|
int ret = addCommandToBatchAndProcessIfFull(c);
|
|
/* If the command was not added to the commands batch, process it immediately */
|
|
if (ret == C_ERR) {
|
|
if (processPendingCommandAndInputBuffer(c) == C_OK) beforeNextClient(c);
|
|
}
|
|
if (list_length_before_command_execute != listLength(server.clients_pending_io_read)) {
|
|
/* A client was unlink from the list possibly making the next node invalid */
|
|
next = listFirst(server.clients_pending_io_read);
|
|
}
|
|
}
|
|
|
|
processClientsCommandsBatch();
|
|
|
|
return processed;
|
|
}
|
|
|
|
/* Returns the actual client eviction limit based on current configuration or
|
|
* 0 if no limit. */
|
|
size_t getClientEvictionLimit(void) {
|
|
size_t maxmemory_clients_actual = SIZE_MAX;
|
|
|
|
/* Handle percentage of maxmemory*/
|
|
if (server.maxmemory_clients < 0 && server.maxmemory > 0) {
|
|
unsigned long long maxmemory_clients_bytes =
|
|
(unsigned long long)((double)server.maxmemory * -(double)server.maxmemory_clients / 100);
|
|
if (maxmemory_clients_bytes <= SIZE_MAX) maxmemory_clients_actual = maxmemory_clients_bytes;
|
|
} else if (server.maxmemory_clients > 0)
|
|
maxmemory_clients_actual = server.maxmemory_clients;
|
|
else
|
|
return 0;
|
|
|
|
/* Don't allow a too small maxmemory-clients to avoid cases where we can't communicate
|
|
* at all with the server because of bad configuration */
|
|
if (maxmemory_clients_actual < 1024 * 128) maxmemory_clients_actual = 1024 * 128;
|
|
|
|
return maxmemory_clients_actual;
|
|
}
|
|
|
|
void evictClients(void) {
|
|
if (!server.client_mem_usage_buckets) return;
|
|
/* Start eviction from topmost bucket (largest clients) */
|
|
int curr_bucket = CLIENT_MEM_USAGE_BUCKETS - 1;
|
|
listIter bucket_iter;
|
|
listRewind(server.client_mem_usage_buckets[curr_bucket].clients, &bucket_iter);
|
|
size_t client_eviction_limit = getClientEvictionLimit();
|
|
if (client_eviction_limit == 0) return;
|
|
while (server.stat_clients_type_memory[CLIENT_TYPE_NORMAL] + server.stat_clients_type_memory[CLIENT_TYPE_PUBSUB] >
|
|
client_eviction_limit) {
|
|
listNode *ln = listNext(&bucket_iter);
|
|
if (ln) {
|
|
client *c = ln->value;
|
|
sds ci = catClientInfoString(sdsempty(), c, server.hide_user_data_from_log);
|
|
serverLog(LL_NOTICE, "Evicting client: %s", ci);
|
|
freeClient(c);
|
|
sdsfree(ci);
|
|
server.stat_evictedclients++;
|
|
} else {
|
|
curr_bucket--;
|
|
if (curr_bucket < 0) {
|
|
serverLog(LL_WARNING, "Over client maxmemory after evicting all evictable clients");
|
|
break;
|
|
}
|
|
listRewind(server.client_mem_usage_buckets[curr_bucket].clients, &bucket_iter);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* IO threads functions */
|
|
|
|
void ioThreadReadQueryFromClient(void *data) {
|
|
client *c = data;
|
|
serverAssert(c->io_read_state == CLIENT_PENDING_IO);
|
|
|
|
/* Read */
|
|
readToQueryBuf(c);
|
|
|
|
/* Check for read errors. */
|
|
if (c->nread <= 0) {
|
|
goto done;
|
|
}
|
|
|
|
/* Skip command parsing if the READ_FLAGS_DONT_PARSE flag is set. */
|
|
if (c->read_flags & READ_FLAGS_DONT_PARSE) {
|
|
goto done;
|
|
}
|
|
|
|
/* Handle QB limit */
|
|
if (c->read_flags & READ_FLAGS_QB_LIMIT_REACHED) {
|
|
goto done;
|
|
}
|
|
|
|
parseInputBuffer(c);
|
|
trimCommandQueue(c);
|
|
prepareCommandQueue(c);
|
|
|
|
/* Parsing was not completed - let the main-thread handle it. */
|
|
if (!(c->read_flags & READ_FLAGS_PARSING_COMPLETED)) {
|
|
goto done;
|
|
}
|
|
|
|
/* Empty command - Multibulk processing could see a <= 0 length. */
|
|
if (c->argc == 0) {
|
|
goto done;
|
|
}
|
|
|
|
done:
|
|
/* Only trim query buffer for non-primary clients
|
|
* Primary client's buffer is handled by main thread using repl_applied position */
|
|
if (!(c->read_flags & READ_FLAGS_REPLICATED)) {
|
|
trimClientQueryBuffer(c);
|
|
}
|
|
atomic_thread_fence(memory_order_release);
|
|
c->io_read_state = CLIENT_COMPLETED_IO;
|
|
}
|
|
|
|
void ioThreadWriteToClient(void *data) {
|
|
client *c = data;
|
|
serverAssert(c->io_write_state == CLIENT_PENDING_IO);
|
|
c->nwritten = 0;
|
|
if (c->write_flags & WRITE_FLAGS_IS_REPLICA) {
|
|
writeToReplica(c);
|
|
} else {
|
|
_writeToClient(c);
|
|
}
|
|
|
|
atomic_thread_fence(memory_order_release);
|
|
c->io_write_state = CLIENT_COMPLETED_IO;
|
|
}
|