diff --git a/.doubanpde/Makefile b/.doubanpde/Makefile new file mode 100644 index 0000000..3409322 --- /dev/null +++ b/.doubanpde/Makefile @@ -0,0 +1,27 @@ +SHELL := /bin/bash +PROJECT_DIR := /home/project + +env: + dpi -y -D "-y" + mkdir -p /tmp/gobeansproxy_prefix/proxy/ + +build: + go build -o gobeansproxy_bin + +start-proxy: build + ./gobeansproxy_bin -confdir .doubanpde/scripts/bdb/gobeansproxy/prefix-switch-cfg/conf/ + +start-riven-proxy: build + ./gobeansproxy_bin -confdir .doubanpde/scripts/bdb/rivenbeansproxy/conf/ + +start-proxy-gc-trace: build + GODEBUG=gctrace=1 ./gobeansproxy_bin -confdir .doubanpde/scripts/bdb/gobeansproxy/prefix-switch-cfg/conf/ + +start-proxy-valgrind: build + G_SLICE=always-malloc G_DEBUG=gc-friendly valgrind -v --tool=memcheck --leak-check=full --num-callers=40 --error-limit=no --log-file=valgrind.log ./gobeansproxy_bin -confdir .doubanpde/scripts/bdb/gobeansproxy/prefix-switch-cfg/conf/ + +tail-log: + tail -f /tmp/gobeansproxy_prefix/proxy/*.log + +cqlsh: + cqlsh -u cassandra -p cassandra diff --git a/.doubanpde/pde.yaml b/.doubanpde/pde.yaml new file mode 100644 index 0000000..a209f23 --- /dev/null +++ b/.doubanpde/pde.yaml @@ -0,0 +1,126 @@ +apiVersion: v1 +kind: Pod +metadata: + labels: + app: "gobeansproxy" + createId: "{{ uuid }}" + createdBy: pdectl + createdByUser: wangqiang + runByUser: '{{ .CliArgs.String "username" }}' + runByPdectlVersion: "{{ .CliArgs.App.Version }}" + runnerAddress: "{{ .RunnerAddress }}" + createdTime: "{{ .CreatedTime }}" + pdeVersion: "v0.1.4" + useWebEditor: "false" + webEditorPort: 0 + webEditorType: "" + name: "gobeansproxy" + annotations: + pdectl.douban.com/cfg/exec-cmd: '{{ .CliArgs.String "exec-default-cmd" }}' +spec: + containers: + - name: "main" + env: + - name: HOSTNAME + value: "gobeansproxy-main" + - name: SCRIBE_HOST + value: 10.0.2.2 + image: "docker.douban/sa/pde-go-cli:latest-1.20-v2" + ports: + volumeMounts: + # mount go path src to container go path + - mountPath: /go/src/ + 
name: go-path-src + # mount code folder + - mountPath: /home/project/ + name: code + - mountPath: /root/ + name: userhome + - mountPath: '/home/{{ .CliArgs.String "username" }}' + name: userhome + - mountPath: /fuse:rslave + name: fuse + - mountPath: /etc/douban/ + name: etc-douban + readOnly: true + - mountPath: /etc/localtime + name: etc-localtime + readOnly: true + - mountPath: /var/run/nscd/ + name: var-run-nscd + readOnly: true + workingDir: /home/project + # - name: mc + # image: docker.douban/memcached:latest + # workingDir: / + {{- range (mkSlice 57980 57981 57982 57983) }} + - name: beansdb-{{ . }} + image: docker.douban/platform/gobeansdb:latest + workingDir: /data/ + volumeMounts: + - mountPath: /data + name: beansdb-{{ . }}-data-dir + - mountPath: /gobeansdb/default_beansdb_cfg/ + name: beansdb-{{ . }}-cfg-dir + {{- end }} + - name: cassandra + image: docker.douban/dba/cassandra:4.1.2 + workingDir: / + volumeMounts: + - mountPath: /var/lib/cassandra/ + name: cassandra-data-dir + # - mountPath: /tmp/cassandra/ + # name: cassandra-cfg + # command: + # - "/bin/bash" + # args: + # - "-c" + # - > + # cp -rfv /tmp/cassandra/cassandra.yaml /etc/cassandra/ && + # /usr/local/bin/docker-entrypoint.sh cassandra -f + restartPolicy: Never + volumes: + - hostPath: + path: '{{ expandEnvVar "$GOPATH/src" }}' + type: Directory + name: go-path-src + {{- $env := . }} + {{- range (mkSlice 57980 57981 57982 57983) }} + - hostPath: + path: '{{ $env.CliArgs.String "project-dir" }}/.doubanpde/data/beansdb-{{ . }}/' + type: DirectoryOrCreate + name: beansdb-{{ . }}-data-dir + - hostPath: + path: '{{ $env.CliArgs.String "project-dir" }}/.doubanpde/scripts/bdb/gobeansproxy/{{ . }}/conf/' + type: Directory + name: beansdb-{{ . 
}}-cfg-dir + {{- end }} + - hostPath: + path: '{{ .CliArgs.String "project-dir" }}/.doubanpde/data/cassandra/' + type: DirectoryOrCreate + name: cassandra-data-dir + - hostPath: + path: '{{ .CliArgs.String "project-dir" }}/.doubanpde/scripts/cassandra/' + name: cassandra-cfg + - hostPath: + path: '{{ .CliArgs.String "project-dir" }}' + type: Directory + name: code + - hostPath: + path: '{{ expandEnvVar "$HOME/" }}' + type: Directory + name: userhome + - hostPath: + path: /fuse + type: Directory + name: fuse + - hostPath: + path: /etc/douban/ + name: etc-douban + - hostPath: + path: /etc/localtime + name: etc-localtime + - hostPath: + path: /var/run/nscd/ + name: var-run-nscd + diff --git a/.doubanpde/scripts/bdb/gobeansproxy/57980/conf/global.yaml b/.doubanpde/scripts/bdb/gobeansproxy/57980/conf/global.yaml new file mode 100644 index 0000000..02dc309 --- /dev/null +++ b/.doubanpde/scripts/bdb/gobeansproxy/57980/conf/global.yaml @@ -0,0 +1,32 @@ +hstore: + data: + check_vhash: true + datafile_max_str: 4000M + flush_interval: 60 + flush_wake_str: 10M + no_gc_days: 7 + hint: + hint_index_interval_str: 32K + hint_merge_interval: 5 + hint_no_merged: true + hint_split_cap_str: 1M + htree: + tree_height: 3 + local: + home: /data +mc: + body_big_str: 5M + body_c_str: 0K + body_max_str: 50M + flush_max_str: 100M + max_key_len: 250 + max_req: 16 +server: + accesslog: /tmp/access.log + errorlog: /tmp/error.log + hostname: 127.0.0.1 + listen: 0.0.0.0 + port: 57980 + threads: 4 + webport: 57990 + zk: 'NO' diff --git a/.doubanpde/scripts/bdb/gobeansproxy/57980/conf/route.yaml b/.doubanpde/scripts/bdb/gobeansproxy/57980/conf/route.yaml new file mode 100644 index 0000000..17f8888 --- /dev/null +++ b/.doubanpde/scripts/bdb/gobeansproxy/57980/conf/route.yaml @@ -0,0 +1,26 @@ +backup: +- 127.0.0.1:57983 +main: +- addr: 127.0.0.1:57980 + buckets: &id001 + - '0' + - '1' + - '2' + - '3' + - '4' + - '5' + - '6' + - '7' + - '8' + - '9' + - a + - b + - c + - d + - e + - f +- addr: 
127.0.0.1:57981 + buckets: *id001 +- addr: 127.0.0.1:57982 + buckets: *id001 +numbucket: 16 diff --git a/.doubanpde/scripts/bdb/gobeansproxy/57981/conf/global.yaml b/.doubanpde/scripts/bdb/gobeansproxy/57981/conf/global.yaml new file mode 100644 index 0000000..6006eba --- /dev/null +++ b/.doubanpde/scripts/bdb/gobeansproxy/57981/conf/global.yaml @@ -0,0 +1,32 @@ +hstore: + data: + check_vhash: true + datafile_max_str: 4000M + flush_interval: 60 + flush_wake_str: 10M + no_gc_days: 7 + hint: + hint_index_interval_str: 32K + hint_merge_interval: 5 + hint_no_merged: true + hint_split_cap_str: 1M + htree: + tree_height: 3 + local: + home: /data +mc: + body_big_str: 5M + body_c_str: 0K + body_max_str: 50M + flush_max_str: 100M + max_key_len: 250 + max_req: 16 +server: + accesslog: /tmp/access.log + errorlog: /tmp/error.log + hostname: 127.0.0.1 + listen: 0.0.0.0 + port: 57981 + threads: 4 + webport: 57991 + zk: 'NO' diff --git a/.doubanpde/scripts/bdb/gobeansproxy/57981/conf/route.yaml b/.doubanpde/scripts/bdb/gobeansproxy/57981/conf/route.yaml new file mode 100644 index 0000000..17f8888 --- /dev/null +++ b/.doubanpde/scripts/bdb/gobeansproxy/57981/conf/route.yaml @@ -0,0 +1,26 @@ +backup: +- 127.0.0.1:57983 +main: +- addr: 127.0.0.1:57980 + buckets: &id001 + - '0' + - '1' + - '2' + - '3' + - '4' + - '5' + - '6' + - '7' + - '8' + - '9' + - a + - b + - c + - d + - e + - f +- addr: 127.0.0.1:57981 + buckets: *id001 +- addr: 127.0.0.1:57982 + buckets: *id001 +numbucket: 16 diff --git a/.doubanpde/scripts/bdb/gobeansproxy/57982/conf/global.yaml b/.doubanpde/scripts/bdb/gobeansproxy/57982/conf/global.yaml new file mode 100644 index 0000000..2a8c29c --- /dev/null +++ b/.doubanpde/scripts/bdb/gobeansproxy/57982/conf/global.yaml @@ -0,0 +1,32 @@ +hstore: + data: + check_vhash: true + datafile_max_str: 4000M + flush_interval: 60 + flush_wake_str: 10M + no_gc_days: 7 + hint: + hint_index_interval_str: 32K + hint_merge_interval: 5 + hint_no_merged: true + hint_split_cap_str: 1M + 
htree: + tree_height: 3 + local: + home: /data +mc: + body_big_str: 5M + body_c_str: 0K + body_max_str: 50M + flush_max_str: 100M + max_key_len: 250 + max_req: 16 +server: + accesslog: /tmp/access.log + errorlog: /tmp/error.log + hostname: 127.0.0.1 + listen: 0.0.0.0 + port: 57982 + threads: 4 + webport: 57992 + zk: 'NO' diff --git a/.doubanpde/scripts/bdb/gobeansproxy/57982/conf/route.yaml b/.doubanpde/scripts/bdb/gobeansproxy/57982/conf/route.yaml new file mode 100644 index 0000000..17f8888 --- /dev/null +++ b/.doubanpde/scripts/bdb/gobeansproxy/57982/conf/route.yaml @@ -0,0 +1,26 @@ +backup: +- 127.0.0.1:57983 +main: +- addr: 127.0.0.1:57980 + buckets: &id001 + - '0' + - '1' + - '2' + - '3' + - '4' + - '5' + - '6' + - '7' + - '8' + - '9' + - a + - b + - c + - d + - e + - f +- addr: 127.0.0.1:57981 + buckets: *id001 +- addr: 127.0.0.1:57982 + buckets: *id001 +numbucket: 16 diff --git a/.doubanpde/scripts/bdb/gobeansproxy/57983/conf/global.yaml b/.doubanpde/scripts/bdb/gobeansproxy/57983/conf/global.yaml new file mode 100644 index 0000000..4b49e25 --- /dev/null +++ b/.doubanpde/scripts/bdb/gobeansproxy/57983/conf/global.yaml @@ -0,0 +1,32 @@ +hstore: + data: + check_vhash: true + datafile_max_str: 4000M + flush_interval: 60 + flush_wake_str: 10M + no_gc_days: 7 + hint: + hint_index_interval_str: 32K + hint_merge_interval: 5 + hint_no_merged: true + hint_split_cap_str: 1M + htree: + tree_height: 3 + local: + home: /data +mc: + body_big_str: 5M + body_c_str: 0K + body_max_str: 50M + flush_max_str: 100M + max_key_len: 250 + max_req: 16 +server: + accesslog: /tmp/access.log + errorlog: /tmp/error.log + hostname: 127.0.0.1 + listen: 0.0.0.0 + port: 57983 + threads: 4 + webport: 57993 + zk: 'NO' diff --git a/.doubanpde/scripts/bdb/gobeansproxy/57983/conf/route.yaml b/.doubanpde/scripts/bdb/gobeansproxy/57983/conf/route.yaml new file mode 100644 index 0000000..17f8888 --- /dev/null +++ b/.doubanpde/scripts/bdb/gobeansproxy/57983/conf/route.yaml @@ -0,0 +1,26 @@ +backup: 
+- 127.0.0.1:57983 +main: +- addr: 127.0.0.1:57980 + buckets: &id001 + - '0' + - '1' + - '2' + - '3' + - '4' + - '5' + - '6' + - '7' + - '8' + - '9' + - a + - b + - c + - d + - e + - f +- addr: 127.0.0.1:57981 + buckets: *id001 +- addr: 127.0.0.1:57982 + buckets: *id001 +numbucket: 16 diff --git a/.doubanpde/scripts/bdb/gobeansproxy/dstore-only/conf/proxy.yaml b/.doubanpde/scripts/bdb/gobeansproxy/dstore-only/conf/proxy.yaml new file mode 100644 index 0000000..0f96031 --- /dev/null +++ b/.doubanpde/scripts/bdb/gobeansproxy/dstore-only/conf/proxy.yaml @@ -0,0 +1,78 @@ +cassandra: + default_key_space: doubandb + default_table: kvstore + enable: false + hosts: + - 127.0.0.1:9042 + timeout_ms: 1000 + connect_timeout_ms: 3000 + write_timeout_ms: 1000 + retry_num: 3 + reconnect_interval_sec: 180 + max_conn_for_getm: 10 + num_conns: 10 + username: "doubandb_test" + password: "doubandb_test" + consistency: "local_one" + prefix_table_dispatcher_cfg: + # if not enable will use default keyspace and table + enable: true + static: + # dispatch prefix1 key to table table_name1 + kvstore_prefix_a: + - "/prefix_a" + cfg_table: bdb_prefix_table_finder + cfg_keyspace: doubandb + prefix_rw_dispatcher_cfg: + enable: true + static: + # dispatch prefix /test_prefix_c/ to dual write + br1w1cr0w1: + - "/test_prefix_c/" + - "/test_prefix_d/" + br0w0cr1w1: + - "test_" + cfg_table: bdb_prefix_rw_switcher + cfg_keyspace: doubandb + default_storage: "br1w1cr0w0" + dual_write_err_cfg: + dump_to_dir: /tmp/log/gobeansproxy/proxy/ + log_file_name: dual_write_err.log + logger_level: "INFO" + rotate_size_mb: 100 + compress: true + max_ages: 7 + max_backups: 100 +dstore: + enable: true + connect_timeout_ms: 300 + dial_fail_silence_ms: 5000 + error_seconds: 10 + item_size_stats: 4096 + max_connect_errors: 3 + max_free_conns_per_host: 20 + n: 3 + r: 1 + read_timeout_ms: 2000 + response_time_min: 4000 + response_time_seconds: 10 + score_deviation: 10 + w: 2 + write_timeout_ms: 2000 +mc: + body_big_str: 
5M + body_c_str: 0K + body_max_str: 50M + max_key_len: 250 + max_req: 16 +proxy: + accesslog: /tmp/log/gobeansproxy/proxy/access.log + errorlog: /tmp/log/gobeansproxy/proxy/error.log + hostname: 127.0.0.1 + listen: 0.0.0.0 + port: 47907 + staticdir: /var/lib/gobeansproxy + threads: 8 + webport: 47910 + zkpath: /gobeansproxy/test + zkservers: [] diff --git a/.doubanpde/scripts/bdb/gobeansproxy/dstore-only/conf/route.yaml b/.doubanpde/scripts/bdb/gobeansproxy/dstore-only/conf/route.yaml new file mode 100644 index 0000000..17f8888 --- /dev/null +++ b/.doubanpde/scripts/bdb/gobeansproxy/dstore-only/conf/route.yaml @@ -0,0 +1,26 @@ +backup: +- 127.0.0.1:57983 +main: +- addr: 127.0.0.1:57980 + buckets: &id001 + - '0' + - '1' + - '2' + - '3' + - '4' + - '5' + - '6' + - '7' + - '8' + - '9' + - a + - b + - c + - d + - e + - f +- addr: 127.0.0.1:57981 + buckets: *id001 +- addr: 127.0.0.1:57982 + buckets: *id001 +numbucket: 16 diff --git a/.doubanpde/scripts/bdb/gobeansproxy/prefix-switch-cfg/conf/proxy.yaml b/.doubanpde/scripts/bdb/gobeansproxy/prefix-switch-cfg/conf/proxy.yaml new file mode 100644 index 0000000..47b6165 --- /dev/null +++ b/.doubanpde/scripts/bdb/gobeansproxy/prefix-switch-cfg/conf/proxy.yaml @@ -0,0 +1,80 @@ +cassandra: + default_key_space: doubandb + default_table: kvstore + enable: true + hosts: + - 127.0.0.1:9042 + timeout_ms: 1000 + connect_timeout_ms: 3000 + write_timeout_ms: 1000 + retry_num: 3 + reconnect_interval_sec: 180 + max_conn_for_getm: 10 + num_conns: 10 + username: "doubandb_test" + password: "doubandb_test" + consistency: "local_one" + prefix_table_dispatcher_cfg: + # if not enable will use default keyspace and table + enable: true + static: + # dispatch prefix1 key to table table_name1 + kvstore_ark: + - "/ark" + cfg_table: bdb_prefix_table_finder + cfg_keyspace: doubandb + prefix_rw_dispatcher_cfg: + enable: true + static: + # dispatch prefix /test_prefix_c/ to dual write + br1w1cr0w1: + - "/test_prefix_c/" + - "/test_prefix_d/" + - 
"/arkark/" + br0w0cr1w1: + - "test_" + cfg_table: bdb_prefix_rw_switcher + cfg_keyspace: doubandb + default_storage: "br0w1cr1w1" + dual_write_err_cfg: + dump_to_dir: /tmp/gobeansproxy_prefix/proxy/ + log_file_name: dual_write_err.log + logger_level: "INFO" + rotate_size_mb: 100 + compress: true + max_ages: 7 + max_backups: 100 +dstore: + enable: true + connect_timeout_ms: 300 + dial_fail_silence_ms: 5000 + error_seconds: 10 + item_size_stats: 4096 + max_connect_errors: 3 + max_free_conns_per_host: 20 + n: 3 + r: 1 + read_timeout_ms: 2000 + response_time_min: 4000 + response_time_seconds: 10 + score_deviation: 10 + w: 2 + write_timeout_ms: 2000 +mc: + body_big_str: 5M + body_c_str: 0K + body_max_str: 50M + max_key_len: 250 + max_req: 16 +proxy: + accesslog: /tmp/gobeansproxy_prefix/proxy/access.log + errorlog: /tmp/gobeansproxy_prefix/proxy/error.log + hostname: 127.0.0.1 + listen: 0.0.0.0 + port: 47907 + staticdir: /var/lib/gobeansproxy + threads: 8 + webport: 47910 + zkpath: /gobeansproxy/test + zkservers: + - zk1:2181 diff --git a/.doubanpde/scripts/bdb/gobeansproxy/prefix-switch-cfg/conf/route.yaml b/.doubanpde/scripts/bdb/gobeansproxy/prefix-switch-cfg/conf/route.yaml new file mode 100644 index 0000000..17f8888 --- /dev/null +++ b/.doubanpde/scripts/bdb/gobeansproxy/prefix-switch-cfg/conf/route.yaml @@ -0,0 +1,26 @@ +backup: +- 127.0.0.1:57983 +main: +- addr: 127.0.0.1:57980 + buckets: &id001 + - '0' + - '1' + - '2' + - '3' + - '4' + - '5' + - '6' + - '7' + - '8' + - '9' + - a + - b + - c + - d + - e + - f +- addr: 127.0.0.1:57981 + buckets: *id001 +- addr: 127.0.0.1:57982 + buckets: *id001 +numbucket: 16 diff --git a/.doubanpde/scripts/bdb/rivenbeansproxy/conf/proxy.yaml b/.doubanpde/scripts/bdb/rivenbeansproxy/conf/proxy.yaml new file mode 100644 index 0000000..e3d679c --- /dev/null +++ b/.doubanpde/scripts/bdb/rivenbeansproxy/conf/proxy.yaml @@ -0,0 +1,80 @@ +cassandra: + default_key_space: doubandb + default_table: kvstore + enable: true + hosts: + - 
127.0.0.1:9042 + timeout_ms: 1000 + connect_timeout_ms: 3000 + write_timeout_ms: 1000 + retry_num: 3 + reconnect_interval_sec: 180 + max_conn_for_getm: 10 + num_conns: 10 + username: "doubandb_test" + password: "doubandb_test" + consistency: "local_one" + prefix_table_dispatcher_cfg: + # if not enable will use default keyspace and table + enable: true + static: + # dispatch prefix1 key to table table_name1 + kvstore_ark: + - "/ark" + cfg_table: bdb_prefix_table_finder + cfg_keyspace: doubandb + prefix_rw_dispatcher_cfg: + enable: false + static: + # dispatch prefix /test_prefix_c/ to dual write + br1w1cr0w1: + - "/test_prefix_c/" + - "/test_prefix_d/" + - "/arkark/" + br0w0cr1w1: + - "test_" + cfg_table: bdb_prefix_rw_switcher + cfg_keyspace: doubandb + default_storage: "br0w0cr1w0" + dual_write_err_cfg: + dump_to_dir: /tmp/gobeansproxy_prefix/proxy/ + log_file_name: dual_write_err.log + logger_level: "INFO" + rotate_size_mb: 100 + compress: true + max_ages: 7 + max_backups: 100 +dstore: + enable: false + connect_timeout_ms: 300 + dial_fail_silence_ms: 5000 + error_seconds: 10 + item_size_stats: 4096 + max_connect_errors: 3 + max_free_conns_per_host: 20 + n: 3 + r: 1 + read_timeout_ms: 2000 + response_time_min: 4000 + response_time_seconds: 10 + score_deviation: 10 + w: 2 + write_timeout_ms: 2000 +mc: + body_big_str: 5M + body_c_str: 0K + body_max_str: 50M + max_key_len: 250 + max_req: 16 +proxy: + accesslog: /tmp/gobeansproxy_prefix/proxy/access.log + errorlog: /tmp/gobeansproxy_prefix/proxy/error.log + hostname: 127.0.0.1 + listen: 0.0.0.0 + port: 47907 + staticdir: /var/lib/gobeansproxy + threads: 8 + webport: 47910 + zkpath: /gobeansproxy/test + zkservers: + - zk1:2181 diff --git a/.doubanpde/scripts/cassandra/cassandra.yaml b/.doubanpde/scripts/cassandra/cassandra.yaml new file mode 100644 index 0000000..674016d --- /dev/null +++ b/.doubanpde/scripts/cassandra/cassandra.yaml @@ -0,0 +1,1820 @@ + +# Cassandra storage config YAML + +# NOTE: +# See 
https://cassandra.apache.org/doc/latest/configuration/ for +# full explanations of configuration directives +# /NOTE + +# The name of the cluster. This is mainly used to prevent machines in +# one logical cluster from joining another. +cluster_name: 'Test Cluster' + +# This defines the number of tokens randomly assigned to this node on the ring +# The more tokens, relative to other nodes, the larger the proportion of data +# that this node will store. You probably want all nodes to have the same number +# of tokens assuming they have equal hardware capability. +# +# If you leave this unspecified, Cassandra will use the default of 1 token for legacy compatibility, +# and will use the initial_token as described below. +# +# Specifying initial_token will override this setting on the node's initial start, +# on subsequent starts, this setting will apply even if initial token is set. +# +# See https://cassandra.apache.org/doc/latest/getting_started/production.html#tokens for +# best practice information about num_tokens. +# +num_tokens: 16 + +# Triggers automatic allocation of num_tokens tokens for this node. The allocation +# algorithm attempts to choose tokens in a way that optimizes replicated load over +# the nodes in the datacenter for the replica factor. +# +# The load assigned to each node will be close to proportional to its number of +# vnodes. +# +# Only supported with the Murmur3Partitioner. + +# Replica factor is determined via the replication strategy used by the specified +# keyspace. +# allocate_tokens_for_keyspace: KEYSPACE + +# Replica factor is explicitly set, regardless of keyspace or datacenter. +# This is the replica factor within the datacenter, like NTS. +allocate_tokens_for_local_replication_factor: 3 + +# initial_token allows you to specify tokens manually. 
While you can use it with +# vnodes (num_tokens > 1, above) -- in which case you should provide a +# comma-separated list -- it's primarily used when adding nodes to legacy clusters +# that do not have vnodes enabled. +# initial_token: + +# May either be "true" or "false" to enable globally +hinted_handoff_enabled: true + +# When hinted_handoff_enabled is true, a black list of data centers that will not +# perform hinted handoff +# hinted_handoff_disabled_datacenters: +# - DC1 +# - DC2 + +# this defines the maximum amount of time a dead host will have hints +# generated. After it has been dead this long, new hints for it will not be +# created until it has been seen alive and gone down again. +# Min unit: ms +max_hint_window: 3h + +# Maximum throttle in KiBs per second, per delivery thread. This will be +# reduced proportionally to the number of nodes in the cluster. (If there +# are two nodes in the cluster, each delivery thread will use the maximum +# rate; if there are three, each will throttle to half of the maximum, +# since we expect two nodes to be delivering hints simultaneously.) +# Min unit: KiB +hinted_handoff_throttle: 1024KiB + +# Number of threads with which to deliver hints; +# Consider increasing this number when you have multi-dc deployments, since +# cross-dc handoff tends to be slower +max_hints_delivery_threads: 2 + +# Directory where Cassandra should store hints. +# If not set, the default directory is $CASSANDRA_HOME/data/hints. +# hints_directory: /var/lib/cassandra/hints + +# How often hints should be flushed from the internal buffers to disk. +# Will *not* trigger fsync. +# Min unit: ms +hints_flush_period: 10000ms + +# Maximum size for a single hints file, in mebibytes. +# Min unit: MiB +max_hints_file_size: 128MiB + +# The file size limit to store hints for an unreachable host, in mebibytes. +# Once the local hints files have reached the limit, no more new hints will be created. +# Set a non-positive value will disable the size limit. 
+# max_hints_size_per_host: 0MiB + +# Enable / disable automatic cleanup for the expired and orphaned hints file. +# Disable the option in order to preserve those hints on the disk. +auto_hints_cleanup_enabled: false + +# Compression to apply to the hint files. If omitted, hints files +# will be written uncompressed. LZ4, Snappy, and Deflate compressors +# are supported. +#hints_compression: +# - class_name: LZ4Compressor +# parameters: +# - + +# Enable / disable persistent hint windows. +# +# If set to false, a hint will be stored only in case a respective node +# that hint is for is down less than or equal to max_hint_window. +# +# If set to true, a hint will be stored in case there is not any +# hint which was stored earlier than max_hint_window. This is for cases +# when a node keeps to restart and hints are not delivered yet, we would be saving +# hints for that node indefinitely. +# +# Defaults to true. +# +# hint_window_persistent_enabled: true + +# Maximum throttle in KiBs per second, total. This will be +# reduced proportionally to the number of nodes in the cluster. +# Min unit: KiB +batchlog_replay_throttle: 1024KiB + +# Authentication backend, implementing IAuthenticator; used to identify users +# Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllAuthenticator, +# PasswordAuthenticator}. +# +# - AllowAllAuthenticator performs no checks - set it to disable authentication. +# - PasswordAuthenticator relies on username/password pairs to authenticate +# users. It keeps usernames and hashed passwords in system_auth.roles table. +# Please increase system_auth keyspace replication factor if you use this authenticator. 
+# If using PasswordAuthenticator, CassandraRoleManager must also be used (see below) +# authenticator: AllowAllAuthenticator +authenticator: PasswordAuthenticator + +# Authorization backend, implementing IAuthorizer; used to limit access/provide permissions +# Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllAuthorizer, +# CassandraAuthorizer}. +# +# - AllowAllAuthorizer allows any action to any user - set it to disable authorization. +# - CassandraAuthorizer stores permissions in system_auth.role_permissions table. Please +# increase system_auth keyspace replication factor if you use this authorizer. +authorizer: CassandraAuthorizer + +# Part of the Authentication & Authorization backend, implementing IRoleManager; used +# to maintain grants and memberships between roles. +# Out of the box, Cassandra provides org.apache.cassandra.auth.CassandraRoleManager, +# which stores role information in the system_auth keyspace. Most functions of the +# IRoleManager require an authenticated login, so unless the configured IAuthenticator +# actually implements authentication, most of this functionality will be unavailable. +# +# - CassandraRoleManager stores role data in the system_auth keyspace. Please +# increase system_auth keyspace replication factor if you use this role manager. +role_manager: CassandraRoleManager + +# Network authorization backend, implementing INetworkAuthorizer; used to restrict user +# access to certain DCs +# Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllNetworkAuthorizer, +# CassandraNetworkAuthorizer}. +# +# - AllowAllNetworkAuthorizer allows access to any DC to any user - set it to disable authorization. +# - CassandraNetworkAuthorizer stores permissions in system_auth.network_permissions table. Please +# increase system_auth keyspace replication factor if you use this authorizer. 
+network_authorizer: AllowAllNetworkAuthorizer + +# Depending on the auth strategy of the cluster, it can be beneficial to iterate +# from root to table (root -> ks -> table) instead of table to root (table -> ks -> root). +# As the auth entries are whitelisting, once a permission is found you know it to be +# valid. We default to false as the legacy behavior is to query at the table level then +# move back up to the root. See CASSANDRA-17016 for details. +# traverse_auth_from_root: false + +# Validity period for roles cache (fetching granted roles can be an expensive +# operation depending on the role manager, CassandraRoleManager is one example) +# Granted roles are cached for authenticated sessions in AuthenticatedUser and +# after the period specified here, become eligible for (async) reload. +# Defaults to 2000, set to 0 to disable caching entirely. +# Will be disabled automatically for AllowAllAuthenticator. +# For a long-running cache using roles_cache_active_update, consider +# setting to something longer such as a daily validation: 86400000 +# Min unit: ms +roles_validity: 2000ms + +# Refresh interval for roles cache (if enabled). +# After this interval, cache entries become eligible for refresh. Upon next +# access, an async reload is scheduled and the old value returned until it +# completes. If roles_validity is non-zero, then this must be +# also. +# This setting is also used to inform the interval of auto-updating if +# using roles_cache_active_update. +# Defaults to the same value as roles_validity. +# For a long-running cache, consider setting this to 60000 (1 hour) etc. +# Min unit: ms +# roles_update_interval: 2000ms + +# If true, cache contents are actively updated by a background task at the +# interval set by roles_update_interval. If false, cache entries +# become eligible for refresh after their update interval. Upon next access, +# an async reload is scheduled and the old value returned until it completes. 
+# roles_cache_active_update: false + +# Validity period for permissions cache (fetching permissions can be an +# expensive operation depending on the authorizer, CassandraAuthorizer is +# one example). Defaults to 2000, set to 0 to disable. +# Will be disabled automatically for AllowAllAuthorizer. +# For a long-running cache using permissions_cache_active_update, consider +# setting to something longer such as a daily validation: 86400000ms +# Min unit: ms +permissions_validity: 2000ms + +# Refresh interval for permissions cache (if enabled). +# After this interval, cache entries become eligible for refresh. Upon next +# access, an async reload is scheduled and the old value returned until it +# completes. If permissions_validity is non-zero, then this must be +# also. +# This setting is also used to inform the interval of auto-updating if +# using permissions_cache_active_update. +# Defaults to the same value as permissions_validity. +# For a longer-running permissions cache, consider setting to update hourly (60000) +# Min unit: ms +# permissions_update_interval: 2000ms + +# If true, cache contents are actively updated by a background task at the +# interval set by permissions_update_interval. If false, cache entries +# become eligible for refresh after their update interval. Upon next access, +# an async reload is scheduled and the old value returned until it completes. +# permissions_cache_active_update: false + +# Validity period for credentials cache. This cache is tightly coupled to +# the provided PasswordAuthenticator implementation of IAuthenticator. If +# another IAuthenticator implementation is configured, this cache will not +# be automatically used and so the following settings will have no effect. 
+# Please note, credentials are cached in their encrypted form, so while +# activating this cache may reduce the number of queries made to the +# underlying table, it may not bring a significant reduction in the +# latency of individual authentication attempts. +# Defaults to 2000, set to 0 to disable credentials caching. +# For a long-running cache using credentials_cache_active_update, consider +# setting to something longer such as a daily validation: 86400000 +# Min unit: ms +credentials_validity: 2000ms + +# Refresh interval for credentials cache (if enabled). +# After this interval, cache entries become eligible for refresh. Upon next +# access, an async reload is scheduled and the old value returned until it +# completes. If credentials_validity is non-zero, then this must be +# also. +# This setting is also used to inform the interval of auto-updating if +# using credentials_cache_active_update. +# Defaults to the same value as credentials_validity. +# For a longer-running permissions cache, consider setting to update hourly (60000) +# Min unit: ms +# credentials_update_interval: 2000ms + +# If true, cache contents are actively updated by a background task at the +# interval set by credentials_update_interval. If false (default), cache entries +# become eligible for refresh after their update interval. Upon next access, +# an async reload is scheduled and the old value returned until it completes. +# credentials_cache_active_update: false + +# The partitioner is responsible for distributing groups of rows (by +# partition key) across nodes in the cluster. The partitioner can NOT be +# changed without reloading all data. If you are adding nodes or upgrading, +# you should set this to the same partitioner that you are currently using. +# +# The default partitioner is the Murmur3Partitioner. Older partitioners +# such as the RandomPartitioner, ByteOrderedPartitioner, and +# OrderPreservingPartitioner have been included for backward compatibility only. 
+# For new clusters, you should NOT change this value. +# +partitioner: org.apache.cassandra.dht.Murmur3Partitioner + +# Directories where Cassandra should store data on disk. If multiple +# directories are specified, Cassandra will spread data evenly across +# them by partitioning the token ranges. +# If not set, the default directory is $CASSANDRA_HOME/data/data. +# data_file_directories: +# - /var/lib/cassandra/data + +# Directory where Cassandra should store the data of the local system keyspaces. +# By default Cassandra will store the data of the local system keyspaces in the first of the data directories specified +# by data_file_directories. +# This approach ensures that if one of the other disks is lost Cassandra can continue to operate. For extra security +# this setting allows to store those data on a different directory that provides redundancy. +# local_system_data_file_directory: + +# commit log. when running on magnetic HDD, this should be a +# separate spindle than the data directories. +# If not set, the default directory is $CASSANDRA_HOME/data/commitlog. +# commitlog_directory: /var/lib/cassandra/commitlog + +# Enable / disable CDC functionality on a per-node basis. This modifies the logic used +# for write path allocation rejection (standard: never reject. cdc: reject Mutation +# containing a CDC-enabled table if at space limit in cdc_raw_directory). +cdc_enabled: false + +# CommitLogSegments are moved to this directory on flush if cdc_enabled: true and the +# segment contains mutations for a CDC-enabled table. This should be placed on a +# separate spindle than the data directories. If not set, the default directory is +# $CASSANDRA_HOME/data/cdc_raw. +# cdc_raw_directory: /var/lib/cassandra/cdc_raw + +# Policy for data disk failures: +# +# die +# shut down gossip and client transports and kill the JVM for any fs errors or +# single-sstable errors, so the node can be replaced. 
+# +# stop_paranoid +# shut down gossip and client transports even for single-sstable errors, +# kill the JVM for errors during startup. +# +# stop +# shut down gossip and client transports, leaving the node effectively dead, but +# can still be inspected via JMX, kill the JVM for errors during startup. +# +# best_effort +# stop using the failed disk and respond to requests based on +# remaining available sstables. This means you WILL see obsolete +# data at CL.ONE! +# +# ignore +# ignore fatal errors and let requests fail, as in pre-1.2 Cassandra +disk_failure_policy: stop + +# Policy for commit disk failures: +# +# die +# shut down the node and kill the JVM, so the node can be replaced. +# +# stop +# shut down the node, leaving the node effectively dead, but +# can still be inspected via JMX. +# +# stop_commit +# shutdown the commit log, letting writes collect but +# continuing to service reads, as in pre-2.0.5 Cassandra +# +# ignore +# ignore fatal errors and let the batches fail +commit_failure_policy: stop + +# Maximum size of the native protocol prepared statement cache +# +# Valid values are either "auto" (omitting the value) or a value greater than 0. +# +# Note that specifying a too large value will result in long running GCs and possibly +# out-of-memory errors. Keep the value at a small fraction of the heap. +# +# If you constantly see "prepared statements discarded in the last minute because +# cache limit reached" messages, the first step is to investigate the root cause +# of these messages and check whether prepared statements are used correctly - +# i.e. use bind markers for variable parts. +# +# Only change the default value if you really have more prepared statements than +# fit in the cache. In most cases it is not necessary to change this value. +# Constantly re-preparing statements is a performance penalty.
+# +# Default value ("auto") is 1/256th of the heap or 10MiB, whichever is greater +# Min unit: MiB +prepared_statements_cache_size: + +# Maximum size of the key cache in memory. +# +# Each key cache hit saves 1 seek and each row cache hit saves 2 seeks at the +# minimum, sometimes more. The key cache is fairly tiny for the amount of +# time it saves, so it's worthwhile to use it at large numbers. +# The row cache saves even more time, but must contain the entire row, +# so it is extremely space-intensive. It's best to only use the +# row cache if you have hot rows or static rows. +# +# NOTE: if you reduce the size, you may not get your hottest keys loaded on startup. +# +# Default value is empty to make it "auto" (min(5% of Heap (in MiB), 100MiB)). Set to 0 to disable key cache. +# Min unit: MiB +key_cache_size: + +# Duration in seconds after which Cassandra should +# save the key cache. Caches are saved to saved_caches_directory as +# specified in this configuration file. +# +# Saved caches greatly improve cold-start speeds, and is relatively cheap in +# terms of I/O for the key cache. Row cache saving is much more expensive and +# has limited use. +# +# Default is 14400 or 4 hours. +# Min unit: s +key_cache_save_period: 4h + +# Number of keys from the key cache to save +# Disabled by default, meaning all keys are going to be saved +# key_cache_keys_to_save: 100 + +# Row cache implementation class name. Available implementations: +# +# org.apache.cassandra.cache.OHCProvider +# Fully off-heap row cache implementation (default). +# +# org.apache.cassandra.cache.SerializingCacheProvider +# This is the row cache implementation available +# in previous releases of Cassandra. +# row_cache_class_name: org.apache.cassandra.cache.OHCProvider + +# Maximum size of the row cache in memory.
+# Please note that OHC cache implementation requires some additional off-heap memory to manage +# the map structures and some in-flight memory during operations before/after cache entries can be +# accounted against the cache capacity. This overhead is usually small compared to the whole capacity. +# Do not specify more memory than the system can afford in the worst usual situation and leave some +# headroom for OS block level cache. Never allow your system to swap. +# +# Default value is 0, to disable row caching. +# Min unit: MiB +row_cache_size: 0MiB + +# Duration in seconds after which Cassandra should save the row cache. +# Caches are saved to saved_caches_directory as specified in this configuration file. +# +# Saved caches greatly improve cold-start speeds, and is relatively cheap in +# terms of I/O for the key cache. Row cache saving is much more expensive and +# has limited use. +# +# Default is 0 to disable saving the row cache. +# Min unit: s +row_cache_save_period: 0s + +# Number of keys from the row cache to save. +# Specify 0 (which is the default), meaning all keys are going to be saved +# row_cache_keys_to_save: 100 + +# Maximum size of the counter cache in memory. +# +# Counter cache helps to reduce counter locks' contention for hot counter cells. +# In case of RF = 1 a counter cache hit will cause Cassandra to skip the read before +# write entirely. With RF > 1 a counter cache hit will still help to reduce the duration +# of the lock hold, helping with hot counter cell updates, but will not allow skipping +# the read entirely. Only the local (clock, count) tuple of a counter cell is kept +# in memory, not the whole counter, so it's relatively cheap. +# +# NOTE: if you reduce the size, you may not get your hottest keys loaded on startup. +# +# Default value is empty to make it "auto" (min(2.5% of Heap (in MiB), 50MiB)). Set to 0 to disable counter cache.
+# NOTE: if you perform counter deletes and rely on low gcgs, you should disable the counter cache. +# Min unit: MiB +counter_cache_size: + +# Duration in seconds after which Cassandra should +# save the counter cache (keys only). Caches are saved to saved_caches_directory as +# specified in this configuration file. +# +# Default is 7200 or 2 hours. +# Min unit: s +counter_cache_save_period: 7200s + +# Number of keys from the counter cache to save +# Disabled by default, meaning all keys are going to be saved +# counter_cache_keys_to_save: 100 + +# saved caches +# If not set, the default directory is $CASSANDRA_HOME/data/saved_caches. +# saved_caches_directory: /var/lib/cassandra/saved_caches + +# Number of seconds the server will wait for each cache (row, key, etc ...) to load while starting +# the Cassandra process. Setting this to zero is equivalent to disabling all cache loading on startup +# while still having the cache during runtime. +# Min unit: s +# cache_load_timeout: 30s + +# commitlog_sync may be either "periodic", "group", or "batch." +# +# When in batch mode, Cassandra won't ack writes until the commit log +# has been flushed to disk. Each incoming write will trigger the flush task. +# commitlog_sync_batch_window_in_ms is a deprecated value. Previously it had +# almost no value, and is being removed. +# +# commitlog_sync_batch_window_in_ms: 2 +# +# group mode is similar to batch mode, where Cassandra will not ack writes +# until the commit log has been flushed to disk. The difference is group +# mode will wait up to commitlog_sync_group_window between flushes. +# +# Min unit: ms +# commitlog_sync_group_window: 1000ms +# +# the default option is "periodic" where writes may be acked immediately +# and the CommitLog is simply synced every commitlog_sync_period +# milliseconds. 
+commitlog_sync: periodic +# Min unit: ms +commitlog_sync_period: 10000ms + +# When in periodic commitlog mode, the number of milliseconds to block writes +# while waiting for a slow disk flush to complete. +# Min unit: ms +# periodic_commitlog_sync_lag_block: + +# The size of the individual commitlog file segments. A commitlog +# segment may be archived, deleted, or recycled once all the data +# in it (potentially from each columnfamily in the system) has been +# flushed to sstables. +# +# The default size is 32, which is almost always fine, but if you are +# archiving commitlog segments (see commitlog_archiving.properties), +# then you probably want a finer granularity of archiving; 8 or 16 MB +# is reasonable. +# Max mutation size is also configurable via max_mutation_size setting in +# cassandra.yaml. The default is half the size commitlog_segment_size in bytes. +# This should be positive and less than 2048. +# +# NOTE: If max_mutation_size is set explicitly then commitlog_segment_size must +# be set to at least twice the size of max_mutation_size +# +# Min unit: MiB +commitlog_segment_size: 32MiB + +# Compression to apply to the commit log. If omitted, the commit log +# will be written uncompressed. LZ4, Snappy, and Deflate compressors +# are supported. +# commitlog_compression: +# - class_name: LZ4Compressor +# parameters: +# - + +# Compression to apply to SSTables as they flush for compressed tables. +# Note that tables without compression enabled do not respect this flag. +# +# As high ratio compressors like LZ4HC, Zstd, and Deflate can potentially +# block flushes for too long, the default is to flush with a known fast +# compressor in those cases. Options are: +# +# none : Flush without compressing blocks but while still doing checksums. +# fast : Flush with a fast compressor. If the table is already using a +# fast compressor that compressor is used. +# table: Always flush with the same compressor that the table uses. This +# was the pre 4.0 behavior. 
+# +# flush_compression: fast + +# any class that implements the SeedProvider interface and has a +# constructor that takes a Map of parameters will do. +seed_provider: + # Addresses of hosts that are deemed contact points. + # Cassandra nodes use this list of hosts to find each other and learn + # the topology of the ring. You must change this if you are running + # multiple nodes! + - class_name: org.apache.cassandra.locator.SimpleSeedProvider + parameters: + # seeds is actually a comma-delimited list of addresses. + # Ex: "<ip1>,<ip2>,<ip3>" + - seeds: "10.0.2.100" + +# For workloads with more data than can fit in memory, Cassandra's +# bottleneck will be reads that need to fetch data from +# disk. "concurrent_reads" should be set to (16 * number_of_drives) in +# order to allow the operations to enqueue low enough in the stack +# that the OS and drives can reorder them. Same applies to +# "concurrent_counter_writes", since counter writes read the current +# values before incrementing and writing them back. +# +# On the other hand, since writes are almost never IO bound, the ideal +# number of "concurrent_writes" is dependent on the number of cores in +# your system; (8 * number_of_cores) is a good rule of thumb. +concurrent_reads: 32 +concurrent_writes: 32 +concurrent_counter_writes: 32 + +# For materialized view writes, as there is a read involved, so this should +# be limited by the lesser of concurrent reads or concurrent writes. +concurrent_materialized_view_writes: 32 + +# Maximum memory to use for inter-node and client-server networking buffers. +# +# Defaults to the smaller of 1/16 of heap or 128MB. This pool is allocated off-heap, +# so is in addition to the memory allocated for heap. The cache also has on-heap +# overhead which is roughly 128 bytes per chunk (i.e. 0.2% of the reserved size +# if the default 64k chunk size is used). +# Memory is only allocated when needed. +# Min unit: MiB +# networking_cache_size: 128MiB + +# Enable the sstable chunk cache.
The chunk cache will store recently accessed +# sections of the sstable in-memory as uncompressed buffers. +# file_cache_enabled: false + +# Maximum memory to use for sstable chunk cache and buffer pooling. +# 32MB of this are reserved for pooling buffers, the rest is used for chunk cache +# that holds uncompressed sstable chunks. +# Defaults to the smaller of 1/4 of heap or 512MB. This pool is allocated off-heap, +# so is in addition to the memory allocated for heap. The cache also has on-heap +# overhead which is roughly 128 bytes per chunk (i.e. 0.2% of the reserved size +# if the default 64k chunk size is used). +# Memory is only allocated when needed. +# Min unit: MiB +# file_cache_size: 512MiB + +# Flag indicating whether to allocate on or off heap when the sstable buffer +# pool is exhausted, that is when it has exceeded the maximum memory +# file_cache_size, beyond which it will not cache buffers but allocate on request. + +# buffer_pool_use_heap_if_exhausted: true + +# The strategy for optimizing disk read +# Possible values are: +# ssd (for solid state disks, the default) +# spinning (for spinning disks) +# disk_optimization_strategy: ssd + +# Total permitted memory to use for memtables. Cassandra will stop +# accepting writes when the limit is exceeded until a flush completes, +# and will trigger a flush based on memtable_cleanup_threshold +# If omitted, Cassandra will set both to 1/4 the size of the heap. +# Min unit: MiB +# memtable_heap_space: 2048MiB +# Min unit: MiB +# memtable_offheap_space: 2048MiB + +# memtable_cleanup_threshold is deprecated. The default calculation +# is the only reasonable choice. See the comments on memtable_flush_writers +# for more information. +# +# Ratio of occupied non-flushing memtable size to total permitted size +# that will trigger a flush of the largest memtable. 
Larger mct will +# mean larger flushes and hence less compaction, but also less concurrent +# flush activity which can make it difficult to keep your disks fed +# under heavy write load. +# +# memtable_cleanup_threshold defaults to 1 / (memtable_flush_writers + 1) +# memtable_cleanup_threshold: 0.11 + +# Specify the way Cassandra allocates and manages memtable memory. +# Options are: +# +# heap_buffers +# on heap nio buffers +# +# offheap_buffers +# off heap (direct) nio buffers +# +# offheap_objects +# off heap objects +memtable_allocation_type: heap_buffers + +# Limit memory usage for Merkle tree calculations during repairs. The default +# is 1/16th of the available heap. The main tradeoff is that smaller trees +# have less resolution, which can lead to over-streaming data. If you see heap +# pressure during repairs, consider lowering this, but you cannot go below +# one mebibyte. If you see lots of over-streaming, consider raising +# this or using subrange repair. +# +# For more details see https://issues.apache.org/jira/browse/CASSANDRA-14096. +# +# Min unit: MiB +# repair_session_space: + +# Total space to use for commit logs on disk. +# +# If space gets above this value, Cassandra will flush every dirty CF +# in the oldest segment and remove it. So a small total commitlog space +# will tend to cause more flush activity on less-active columnfamilies. +# +# The default value is the smaller of 8192, and 1/4 of the total space +# of the commitlog volume. +# +# commitlog_total_space: 8192MiB + +# This sets the number of memtable flush writer threads per disk +# as well as the total number of memtables that can be flushed concurrently. +# These are generally a combination of compute and IO bound. +# +# Memtable flushing is more CPU efficient than memtable ingest and a single thread +# can keep up with the ingest rate of a whole server on a single fast disk +# until it temporarily becomes IO bound under contention typically with compaction. 
+# At that point you need multiple flush threads. At some point in the future +# it may become CPU bound all the time. +# +# You can tell if flushing is falling behind using the MemtablePool.BlockedOnAllocation +# metric which should be 0, but will be non-zero if threads are blocked waiting on flushing +# to free memory. +# +# memtable_flush_writers defaults to two for a single data directory. +# This means that two memtables can be flushed concurrently to the single data directory. +# If you have multiple data directories the default is one memtable flushing at a time +# but the flush will use a thread per data directory so you will get two or more writers. +# +# Two is generally enough to flush on a fast disk [array] mounted as a single data directory. +# Adding more flush writers will result in smaller more frequent flushes that introduce more +# compaction overhead. +# +# There is a direct tradeoff between number of memtables that can be flushed concurrently +# and flush size and frequency. More is not better you just need enough flush writers +# to never stall waiting for flushing to free memory. +# +# memtable_flush_writers: 2 + +# Total space to use for change-data-capture logs on disk. +# +# If space gets above this value, Cassandra will throw WriteTimeoutException +# on Mutations including tables with CDC enabled. A CDCCompactor is responsible +# for parsing the raw CDC logs and deleting them when parsing is completed. +# +# The default value is the min of 4096 MiB and 1/8th of the total space +# of the drive where cdc_raw_directory resides. +# Min unit: MiB +# cdc_total_space: 4096MiB + +# When we hit our cdc_raw limit and the CDCCompactor is either running behind +# or experiencing backpressure, we check at the following interval to see if any +# new space for cdc-tracked tables has been made available. Default to 250ms +# Min unit: ms +# cdc_free_space_check_interval: 250ms + +# A fixed memory pool size in MB for SSTable index summaries.
If left +# empty, this will default to 5% of the heap size. If the memory usage of +# all index summaries exceeds this limit, SSTables with low read rates will +# shrink their index summaries in order to meet this limit. However, this +# is a best-effort process. In extreme conditions Cassandra may need to use +# more than this amount of memory. +# Min unit: KiB +index_summary_capacity: + +# How frequently index summaries should be resampled. This is done +# periodically to redistribute memory from the fixed-size pool to sstables +# proportional to their recent read rates. Setting to null value will disable this +# process, leaving existing index summaries at their current sampling level. +# Min unit: m +index_summary_resize_interval: 60m + +# Whether to, when doing sequential writing, fsync() at intervals in +# order to force the operating system to flush the dirty +# buffers. Enable this to avoid sudden dirty buffer flushing from +# impacting read latencies. Almost always a good idea on SSDs; not +# necessarily on platters. +trickle_fsync: false +# Min unit: KiB +trickle_fsync_interval: 10240KiB + +# TCP port, for commands and data +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +storage_port: 7000 + +# SSL port, for legacy encrypted communication. This property is unused unless enabled in +# server_encryption_options (see below). As of cassandra 4.0, this property is deprecated +# as a single port can be used for either/both secure and insecure connections. +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +ssl_storage_port: 7001 + +# Address or interface to bind to and tell other Cassandra nodes to connect to. +# You _must_ change this if you want multiple nodes to be able to communicate! +# +# Set listen_address OR listen_interface, not both. +# +# Leaving it blank leaves it up to InetAddress.getLocalHost().
This +# will always do the Right Thing _if_ the node is properly configured +# (hostname, name resolution, etc), and the Right Thing is to use the +# address associated with the hostname (it might not be). If unresolvable +# it will fall back to InetAddress.getLoopbackAddress(), which is wrong for production systems. +# +# Setting listen_address to 0.0.0.0 is always wrong. +# +listen_address: 10.0.2.100 + +# Set listen_address OR listen_interface, not both. Interfaces must correspond +# to a single address, IP aliasing is not supported. +# listen_interface: eth0 + +# If you choose to specify the interface by name and the interface has an ipv4 and an ipv6 address +# you can specify which should be chosen using listen_interface_prefer_ipv6. If false the first ipv4 +# address will be used. If true the first ipv6 address will be used. Defaults to false preferring +# ipv4. If there is only one address it will be selected regardless of ipv4/ipv6. +# listen_interface_prefer_ipv6: false + +# Address to broadcast to other Cassandra nodes +# Leaving this blank will set it to the same value as listen_address +broadcast_address: 10.0.2.100 + +# When using multiple physical network interfaces, set this +# to true to listen on broadcast_address in addition to +# the listen_address, allowing nodes to communicate in both +# interfaces. +# Ignore this property if the network configuration automatically +# routes between the public and private networks such as EC2. +# listen_on_broadcast_address: false + +# Internode authentication backend, implementing IInternodeAuthenticator; +# used to allow/disallow connections from peer nodes. +# internode_authenticator: org.apache.cassandra.auth.AllowAllInternodeAuthenticator + +# Whether to start the native transport server. +# The address on which the native transport is bound is defined by rpc_address. 
+start_native_transport: true +# port for the CQL native transport to listen for clients on +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +native_transport_port: 9042 +# Enabling native transport encryption in client_encryption_options allows you to either use +# encryption for the standard port or to use a dedicated, additional port along with the unencrypted +# standard native_transport_port. +# Enabling client encryption and keeping native_transport_port_ssl disabled will use encryption +# for native_transport_port. Setting native_transport_port_ssl to a different value +# from native_transport_port will use encryption for native_transport_port_ssl while +# keeping native_transport_port unencrypted. +# native_transport_port_ssl: 9142 +# The maximum threads for handling requests (note that idle threads are stopped +# after 30 seconds so there is not corresponding minimum setting). +# native_transport_max_threads: 128 +# +# The maximum size of allowed frame. Frame (requests) larger than this will +# be rejected as invalid. The default is 16MiB. If you're changing this parameter, +# you may want to adjust max_value_size accordingly. This should be positive and less than 2048. +# Min unit: MiB +# native_transport_max_frame_size: 16MiB + +# The maximum number of concurrent client connections. +# The default is -1, which means unlimited. +# native_transport_max_concurrent_connections: -1 + +# The maximum number of concurrent client connections per source ip. +# The default is -1, which means unlimited. +# native_transport_max_concurrent_connections_per_ip: -1 + +# Controls whether Cassandra honors older, yet currently supported, protocol versions. +# The default is true, which means all supported protocols will be honored. +native_transport_allow_older_protocols: true + +# Controls when idle client connections are closed. Idle connections are ones that had neither reads +# nor writes for a time period. 
+# +# Clients may implement heartbeats by sending OPTIONS native protocol message after a timeout, which +# will reset idle timeout timer on the server side. To close idle client connections, corresponding +# values for heartbeat intervals have to be set on the client side. +# +# Idle connection timeouts are disabled by default. +# Min unit: ms +# native_transport_idle_timeout: 60000ms + +# When enabled, limits the number of native transport requests dispatched for processing per second. +# Behavior once the limit has been breached depends on the value of THROW_ON_OVERLOAD specified in +# the STARTUP message sent by the client during connection establishment. (See section "4.1.1. STARTUP" +# in "CQL BINARY PROTOCOL v5".) With the THROW_ON_OVERLOAD flag enabled, messages that breach the limit +# are dropped, and an OverloadedException is thrown for the client to handle. When the flag is not +# enabled, the server will stop consuming messages from the channel/socket, putting backpressure on +# the client while already dispatched messages are processed. +# native_transport_rate_limiting_enabled: false +# native_transport_max_requests_per_second: 1000000 + +# The address or interface to bind the native transport server to. +# +# Set rpc_address OR rpc_interface, not both. +# +# Leaving rpc_address blank has the same effect as on listen_address +# (i.e. it will be based on the configured hostname of the node). +# +# Note that unlike listen_address, you can specify 0.0.0.0, but you must also +# set broadcast_rpc_address to a value other than 0.0.0.0. +# +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +rpc_address: 0.0.0.0 + +# Set rpc_address OR rpc_interface, not both. Interfaces must correspond +# to a single address, IP aliasing is not supported. 
+# rpc_interface: eth1 + +# If you choose to specify the interface by name and the interface has an ipv4 and an ipv6 address +# you can specify which should be chosen using rpc_interface_prefer_ipv6. If false the first ipv4 +# address will be used. If true the first ipv6 address will be used. Defaults to false preferring +# ipv4. If there is only one address it will be selected regardless of ipv4/ipv6. +# rpc_interface_prefer_ipv6: false + +# RPC address to broadcast to drivers and other Cassandra nodes. This cannot +# be set to 0.0.0.0. If left blank, this will be set to the value of +# rpc_address. If rpc_address is set to 0.0.0.0, broadcast_rpc_address must +# be set. +broadcast_rpc_address: 10.0.2.100 + +# enable or disable keepalive on rpc/native connections +rpc_keepalive: true + +# Uncomment to set socket send buffer size for internode communication +# Note that when setting this, the buffer size is limited by net.core.wmem_max +# and when not setting it it is defined by net.ipv4.tcp_wmem +# See also: +# /proc/sys/net/core/wmem_max +# /proc/sys/net/core/rmem_max +# /proc/sys/net/ipv4/tcp_wmem +# /proc/sys/net/ipv4/tcp_rmem +# and 'man tcp' +# Min unit: B +# internode_socket_send_buffer_size: + +# Uncomment to set socket receive buffer size for internode communication +# Note that when setting this, the buffer size is limited by net.core.rmem_max +# and when not setting it it is defined by net.ipv4.tcp_rmem +# Min unit: B +# internode_socket_receive_buffer_size: + +# Set to true to have Cassandra create a hard link to each sstable +# flushed or streamed locally in a backups/ subdirectory of the +# keyspace data. Removing these links is the operator's +# responsibility. +incremental_backups: false + +# Whether or not to take a snapshot before each compaction. Be +# careful using this option, since Cassandra won't clean up the +# snapshots for you. Mostly useful if you're paranoid when there +# is a data format change.
+snapshot_before_compaction: false + +# Whether or not a snapshot is taken of the data before keyspace truncation +# or dropping of column families. The STRONGLY advised default of true +# should be used to provide data safety. If you set this flag to false, you will +# lose data on truncation or drop. +auto_snapshot: true + +# Adds a time-to-live (TTL) to auto snapshots generated by table +# truncation or drop (when enabled). +# After the TTL is elapsed, the snapshot is automatically cleared. +# By default, auto snapshots *do not* have TTL, uncomment the property below +# to enable TTL on auto snapshots. +# Accepted units: d (days), h (hours) or m (minutes) +# auto_snapshot_ttl: 30d + +# The act of creating or clearing a snapshot involves creating or removing +# potentially tens of thousands of links, which can cause significant performance +# impact, especially on consumer grade SSDs. A non-zero value here can +# be used to throttle these links to avoid negative performance impact of +# taking and clearing snapshots +snapshot_links_per_second: 0 + +# Granularity of the collation index of rows within a partition. +# Increase if your rows are large, or if you have a very large +# number of rows per partition. The competing goals are these: +# +# - a smaller granularity means more index entries are generated +# and looking up rows within the partition by collation column +# is faster +# - but, Cassandra will keep the collation index in memory for hot +# rows (as part of the key cache), so a larger granularity means +# you can cache more hot rows +# Min unit: KiB +column_index_size: 64KiB + +# Per sstable indexed key cache entries (the collation index in memory +# mentioned above) exceeding this size will not be held on heap. +# This means that only partition information is held on heap and the +# index entries are read from disk. +# +# Note that this size refers to the size of the +# serialized index information and not the size of the partition.
+# Min unit: KiB +column_index_cache_size: 2KiB + +# Number of simultaneous compactions to allow, NOT including +# validation "compactions" for anti-entropy repair. Simultaneous +# compactions can help preserve read performance in a mixed read/write +# workload, by mitigating the tendency of small sstables to accumulate +# during a single long-running compaction. The default is usually +# fine and if you experience problems with compaction running too +# slowly or too fast, you should look at +# compaction_throughput first. +# +# concurrent_compactors defaults to the smaller of (number of disks, +# number of cores), with a minimum of 2 and a maximum of 8. +# +# If your data directories are backed by SSD, you should increase this +# to the number of cores. +# concurrent_compactors: 1 + +# Number of simultaneous repair validations to allow. If not set or set to +# a value less than 1, it defaults to the value of concurrent_compactors. +# To set a value greater than concurrent_compactors at startup, the system +# property cassandra.allow_unlimited_concurrent_validations must be set to +# true. To dynamically resize to a value > concurrent_compactors on a running +# node, first call the bypassConcurrentValidatorsLimit method on the +# org.apache.cassandra.db:type=StorageService mbean +# concurrent_validations: 0 + +# Number of simultaneous materialized view builder tasks to allow. +concurrent_materialized_view_builders: 1 + +# Throttles compaction to the given total throughput across the entire +# system. The faster you insert data, the faster you need to compact in +# order to keep the sstable count down, but in general, setting this to +# 16 to 32 times the rate you are inserting data is more than sufficient. +# Setting this to 0 disables throttling. Note that this accounts for all types +# of compaction, including validation compaction (building Merkle trees +# for repairs).
+compaction_throughput: 64MiB/s + +# When compacting, the replacement sstable(s) can be opened before they +# are completely written, and used in place of the prior sstables for +# any range that has been written. This helps to smoothly transfer reads +# between the sstables, reducing page cache churn and keeping hot rows hot +# Set sstable_preemptive_open_interval to null for disabled which is equivalent to +# sstable_preemptive_open_interval_in_mb being negative +# Min unit: MiB +sstable_preemptive_open_interval: 50MiB + +# Starting from 4.1 sstables support UUID based generation identifiers. They are disabled by default +# because once enabled, there is no easy way to downgrade. When the node is restarted with this option +# set to true, each newly created sstable will have a UUID based generation identifier and such files are +# not readable by previous Cassandra versions. At some point, this option will become true by default +# and eventually get removed from the configuration. +uuid_sstable_identifiers_enabled: false + +# When enabled, permits Cassandra to zero-copy stream entire eligible +# SSTables between nodes, including every component. +# This speeds up the network transfer significantly subject to +# throttling specified by entire_sstable_stream_throughput_outbound, +# and entire_sstable_inter_dc_stream_throughput_outbound +# for inter-DC transfers. +# Enabling this will reduce the GC pressure on sending and receiving node. +# When unset, the default is enabled. While this feature tries to keep the +# disks balanced, it cannot guarantee it. This feature will be automatically +# disabled if internode encryption is enabled. +# stream_entire_sstables: true + +# Throttles entire SSTable outbound streaming file transfers on +# this node to the given total throughput in Mbps. +# Setting this value to 0 disables throttling. +# When unset, the default is 200 Mbps or 24 MiB/s.
+# entire_sstable_stream_throughput_outbound: 24MiB/s + +# Throttles entire SSTable file streaming between datacenters. +# Setting this value to 0 disables throttling for entire SSTable inter-DC file streaming. +# When unset, the default is 200 Mbps or 24 MiB/s. +# entire_sstable_inter_dc_stream_throughput_outbound: 24MiB/s + +# Throttles all outbound streaming file transfers on this node to the +# given total throughput in Mbps. This is necessary because Cassandra does +# mostly sequential IO when streaming data during bootstrap or repair, which +# can lead to saturating the network connection and degrading rpc performance. +# When unset, the default is 200 Mbps or 24 MiB/s. +# stream_throughput_outbound: 24MiB/s + +# Throttles all streaming file transfer between the datacenters, +# this setting allows users to throttle inter dc stream throughput in addition +# to throttling all network stream traffic as configured with +# stream_throughput_outbound_megabits_per_sec +# When unset, the default is 200 Mbps or 24 MiB/s. +# inter_dc_stream_throughput_outbound: 24MiB/s + +# Server side timeouts for requests. The server will return a timeout exception +# to the client if it can't complete an operation within the corresponding +# timeout. Those settings are a protection against: +# 1) having client wait on an operation that might never terminate due to some +# failures. +# 2) operations that use too much CPU/read too much data (leading to memory build +# up) by putting a limit to how long an operation will execute. +# For this reason, you should avoid putting these settings too high. In other words, +# if you are timing out requests because of underlying resource constraints then +# increasing the timeout will just cause more problems. Of course putting them too +# low is equally ill-advised since clients could get timeouts even for successful +# operations just because the timeout setting is too tight. 
+ +# How long the coordinator should wait for read operations to complete. +# Lowest acceptable value is 10 ms. +# Min unit: ms +read_request_timeout: 5000ms +# How long the coordinator should wait for seq or index scans to complete. +# Lowest acceptable value is 10 ms. +# Min unit: ms +range_request_timeout: 10000ms +# How long the coordinator should wait for writes to complete. +# Lowest acceptable value is 10 ms. +# Min unit: ms +write_request_timeout: 2000ms +# How long the coordinator should wait for counter writes to complete. +# Lowest acceptable value is 10 ms. +# Min unit: ms +counter_write_request_timeout: 5000ms +# How long a coordinator should continue to retry a CAS operation +# that contends with other proposals for the same row. +# Lowest acceptable value is 10 ms. +# Min unit: ms +cas_contention_timeout: 1000ms +# How long the coordinator should wait for truncates to complete +# (This can be much longer, because unless auto_snapshot is disabled +# we need to flush first so we can snapshot before removing the data.) +# Lowest acceptable value is 10 ms. +# Min unit: ms +truncate_request_timeout: 60000ms +# The default timeout for other, miscellaneous operations. +# Lowest acceptable value is 10 ms. +# Min unit: ms +request_timeout: 10000ms + +# Defensive settings for protecting Cassandra from true network partitions. +# See (CASSANDRA-14358) for details. +# +# The amount of time to wait for internode tcp connections to establish. +# Min unit: ms +# internode_tcp_connect_timeout: 2000ms +# +# The amount of time unacknowledged data is allowed on a connection before we throw out the connection +# Note this is only supported on Linux + epoll, and it appears to behave oddly above a setting of 30000 +# (it takes much longer than 30s) as of Linux 4.12. If you want something that high set this to 0 +# which picks up the OS default and configure the net.ipv4.tcp_retries2 sysctl to be ~8. 
+# Min unit: ms +# internode_tcp_user_timeout: 30000ms + +# The amount of time unacknowledged data is allowed on a streaming connection. +# The default is 5 minutes. Increase it or set it to 0 in order to increase the timeout. +# Min unit: ms +# internode_streaming_tcp_user_timeout: 300000ms + +# Global, per-endpoint and per-connection limits imposed on messages queued for delivery to other nodes +# and waiting to be processed on arrival from other nodes in the cluster. These limits are applied to the on-wire +# size of the message being sent or received. +# +# The basic per-link limit is consumed in isolation before any endpoint or global limit is imposed. +# Each node-pair has three links: urgent, small and large. So any given node may have a maximum of +# N*3*(internode_application_send_queue_capacity+internode_application_receive_queue_capacity) +# messages queued without any coordination between them although in practice, with token-aware routing, only RF*tokens +# nodes should need to communicate with significant bandwidth. +# +# The per-endpoint limit is imposed on all messages exceeding the per-link limit, simultaneously with the global limit, +# on all links to or from a single node in the cluster. +# The global limit is imposed on all messages exceeding the per-link limit, simultaneously with the per-endpoint limit, +# on all links to or from any node in the cluster. +# +# Min unit: B +# internode_application_send_queue_capacity: 4MiB +# internode_application_send_queue_reserve_endpoint_capacity: 128MiB +# internode_application_send_queue_reserve_global_capacity: 512MiB +# internode_application_receive_queue_capacity: 4MiB +# internode_application_receive_queue_reserve_endpoint_capacity: 128MiB +# internode_application_receive_queue_reserve_global_capacity: 512MiB + + +# How long before a node logs slow queries. 
Select queries that take longer than +# this timeout to execute, will generate an aggregated log message, so that slow queries +# can be identified. Set this value to zero to disable slow query logging. +# Min unit: ms +slow_query_log_timeout: 500ms + +# Enable operation timeout information exchange between nodes to accurately +# measure request timeouts. If disabled, replicas will assume that requests +# were forwarded to them instantly by the coordinator, which means that +# under overload conditions we will waste that much extra time processing +# already-timed-out requests. +# +# Warning: It is generally assumed that users have set up NTP on their clusters, and that clocks are modestly in sync, +# since this is a requirement for general correctness of last write wins. +# internode_timeout: true + +# Set period for idle state control messages for earlier detection of failed streams +# This node will send a keep-alive message periodically on the streaming's control channel. +# This ensures that any eventual SocketTimeoutException will occur within 2 keep-alive cycles +# If the node cannot send, or timeouts sending, the keep-alive message on the netty control channel +# the stream session is closed. +# Default value is 300s (5 minutes), which means stalled streams +# are detected within 10 minutes +# Specify 0 to disable. +# Min unit: s +# streaming_keep_alive_period: 300s + +# Limit number of connections per host for streaming +# Increase this when you notice that joins are CPU-bound rather than network +# bound (for example a few nodes with big files). +# streaming_connections_per_host: 1 + +# Settings for stream stats tracking; used by system_views.streaming table +# How long before a stream is evicted from tracking; this impacts both historic and currently running +# streams. +# streaming_state_expires: 3d +# How much memory may be used for tracking before evicting session from tracking; once crossed +# historic and currently running streams may be impacted.
+# streaming_state_size: 40MiB +# Enable/Disable tracking of streaming stats +# streaming_stats_enabled: true + +# Allows denying configurable access (rw/rr) to operations on configured ks, table, and partitions, intended for use by +# operators to manage cluster health vs application access. See CASSANDRA-12106 and CEP-13 for more details. +# partition_denylist_enabled: false + +# denylist_writes_enabled: true +# denylist_reads_enabled: true +# denylist_range_reads_enabled: true + +# The interval at which keys in the cache for denylisting will "expire" and async refresh from the backing DB. +# Note: this serves only as a fail-safe, as the usage pattern is expected to be "mutate state, refresh cache" on any +# changes to the underlying denylist entries. See documentation for details. +# Min unit: s +# denylist_refresh: 600s + +# In the event of errors on attempting to load the denylist cache, retry on this interval. +# Min unit: s +# denylist_initial_load_retry: 5s + +# We cap the number of denylisted keys allowed per table to keep things from growing unbounded. Nodes will warn above +# this limit while allowing new denylisted keys to be inserted. Denied keys are loaded in natural query / clustering +# ordering by partition key in case of overflow. +# denylist_max_keys_per_table: 1000 + +# We cap the total number of denylisted keys allowed in the cluster to keep things from growing unbounded. +# Nodes will warn on initial cache load that there are too many keys and direct the operator to trim down excess +# entries to within the configured limits. +# denylist_max_keys_total: 10000 + +# Since the denylist in many ways serves to protect the health of the cluster from partitions operators have identified +# as being in a bad state, we usually want more robustness than just CL.ONE on operations to/from these tables to +# ensure that these safeguards are in place. That said, we allow users to configure this if they're so inclined.
+# denylist_consistency_level: QUORUM + +# phi value that must be reached for a host to be marked down. +# most users should never need to adjust this. +# phi_convict_threshold: 8 + +# endpoint_snitch -- Set this to a class that implements +# IEndpointSnitch. The snitch has two functions: +# +# - it teaches Cassandra enough about your network topology to route +# requests efficiently +# - it allows Cassandra to spread replicas around your cluster to avoid +# correlated failures. It does this by grouping machines into +# "datacenters" and "racks." Cassandra will do its best not to have +# more than one replica on the same "rack" (which may not actually +# be a physical location) +# +# CASSANDRA WILL NOT ALLOW YOU TO SWITCH TO AN INCOMPATIBLE SNITCH +# ONCE DATA IS INSERTED INTO THE CLUSTER. This would cause data loss. +# This means that if you start with the default SimpleSnitch, which +# locates every node on "rack1" in "datacenter1", your only options +# if you need to add another datacenter are GossipingPropertyFileSnitch +# (and the older PFS). From there, if you want to migrate to an +# incompatible snitch like Ec2Snitch you can do it by adding new nodes +# under Ec2Snitch (which will locate them in a new "datacenter") and +# decommissioning the old ones. +# +# Out of the box, Cassandra provides: +# +# SimpleSnitch: +# Treats Strategy order as proximity. This can improve cache +# locality when disabling read repair. Only appropriate for +# single-datacenter deployments. +# +# GossipingPropertyFileSnitch +# This should be your go-to snitch for production use. The rack +# and datacenter for the local node are defined in +# cassandra-rackdc.properties and propagated to other nodes via +# gossip. If cassandra-topology.properties exists, it is used as a +# fallback, allowing migration from the PropertyFileSnitch. +# +# PropertyFileSnitch: +# Proximity is determined by rack and data center, which are +# explicitly configured in cassandra-topology.properties. 
+# +# Ec2Snitch: +# Appropriate for EC2 deployments in a single Region. Loads Region +# and Availability Zone information from the EC2 API. The Region is +# treated as the datacenter, and the Availability Zone as the rack. +# Only private IPs are used, so this will not work across multiple +# Regions. +# +# Ec2MultiRegionSnitch: +# Uses public IPs as broadcast_address to allow cross-region +# connectivity. (Thus, you should set seed addresses to the public +# IP as well.) You will need to open the storage_port or +# ssl_storage_port on the public IP firewall. (For intra-Region +# traffic, Cassandra will switch to the private IP after +# establishing a connection.) +# +# RackInferringSnitch: +# Proximity is determined by rack and data center, which are +# assumed to correspond to the 3rd and 2nd octet of each node's IP +# address, respectively. Unless this happens to match your +# deployment conventions, this is best used as an example of +# writing a custom Snitch class and is provided in that spirit. +# +# You can use a custom Snitch by setting this to the full class name +# of the snitch, which will be assumed to be on your classpath. +endpoint_snitch: SimpleSnitch + +# controls how often to perform the more expensive part of host score +# calculation +# Min unit: ms +dynamic_snitch_update_interval: 100ms +# controls how often to reset all host scores, allowing a bad host to +# possibly recover +# Min unit: ms +dynamic_snitch_reset_interval: 600000ms +# if set greater than zero, this will allow +# 'pinning' of replicas to hosts in order to increase cache capacity. +# The badness threshold will control how much worse the pinned host has to be +# before the dynamic snitch will prefer other replicas over it. This is +# expressed as a double which represents a percentage. Thus, a value of +# 0.2 means Cassandra would continue to prefer the static snitch values +# until the pinned host was 20% worse than the fastest. 
+dynamic_snitch_badness_threshold: 1.0 + +# Configure server-to-server internode encryption +# +# JVM and netty defaults for supported SSL socket protocols and cipher suites can +# be replaced using custom encryption options. This is not recommended +# unless you have policies in place that dictate certain settings, or +# need to disable vulnerable ciphers or protocols in case the JVM cannot +# be updated. +# +# FIPS compliant settings can be configured at JVM level and should not +# involve changing encryption settings here: +# https://docs.oracle.com/javase/8/docs/technotes/guides/security/jsse/FIPS.html +# +# **NOTE** this default configuration is an insecure configuration. If you need to +# enable server-to-server encryption generate server keystores (and truststores for mutual +# authentication) per: +# http://download.oracle.com/javase/8/docs/technotes/guides/security/jsse/JSSERefGuide.html#CreateKeystore +# Then perform the following configuration changes: +# +# Step 1: Set internode_encryption= and explicitly set optional=true. Restart all nodes +# +# Step 2: Set optional=false (or remove it) and if you generated truststores and want to use mutual +# auth set require_client_auth=true. Restart all nodes +server_encryption_options: + # On outbound connections, determine which type of peers to securely connect to. + # The available options are : + # none : Do not encrypt outgoing connections + # dc : Encrypt connections to peers in other datacenters but not within datacenters + # rack : Encrypt connections to peers in other racks but not within racks + # all : Always use encrypted connections + internode_encryption: none + # When set to true, encrypted and unencrypted connections are allowed on the storage_port + # This should _only be true_ while in unencrypted or transitional operation + # optional defaults to true if internode_encryption is none + # optional: true + # If enabled, will open up an encrypted listening socket on ssl_storage_port. 
Should only be used + # during upgrade to 4.0; otherwise, set to false. + legacy_ssl_storage_port_enabled: false + # Set to a valid keystore if internode_encryption is dc, rack or all + keystore: conf/.keystore + keystore_password: cassandra + # Configure the way Cassandra creates SSL contexts. + # To use PEM-based key material, see org.apache.cassandra.security.PEMBasedSslContextFactory + # ssl_context_factory: + # # Must be an instance of org.apache.cassandra.security.ISslContextFactory + # class_name: org.apache.cassandra.security.DefaultSslContextFactory + # Verify peer server certificates + require_client_auth: false + # Set to a valid trustore if require_client_auth is true + truststore: conf/.truststore + truststore_password: cassandra + # Verify that the host name in the certificate matches the connected host + require_endpoint_verification: false + # More advanced defaults: + # protocol: TLS + # store_type: JKS + # cipher_suites: [ + # TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384, TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256, + # TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA, + # TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA, TLS_RSA_WITH_AES_128_GCM_SHA256, TLS_RSA_WITH_AES_128_CBC_SHA, + # TLS_RSA_WITH_AES_256_CBC_SHA + # ] + +# Configure client-to-server encryption. +# +# **NOTE** this default configuration is an insecure configuration. If you need to +# enable client-to-server encryption generate server keystores (and truststores for mutual +# authentication) per: +# http://download.oracle.com/javase/8/docs/technotes/guides/security/jsse/JSSERefGuide.html#CreateKeystore +# Then perform the following configuration changes: +# +# Step 1: Set enabled=true and explicitly set optional=true. Restart all nodes +# +# Step 2: Set optional=false (or remove it) and if you generated truststores and want to use mutual +# auth set require_client_auth=true. 
Restart all nodes +client_encryption_options: + # Enable client-to-server encryption + enabled: false + # When set to true, encrypted and unencrypted connections are allowed on the native_transport_port + # This should _only be true_ while in unencrypted or transitional operation + # optional defaults to true when enabled is false, and false when enabled is true. + # optional: true + # Set keystore and keystore_password to valid keystores if enabled is true + keystore: conf/.keystore + keystore_password: cassandra + # Configure the way Cassandra creates SSL contexts. + # To use PEM-based key material, see org.apache.cassandra.security.PEMBasedSslContextFactory + # ssl_context_factory: + # # Must be an instance of org.apache.cassandra.security.ISslContextFactory + # class_name: org.apache.cassandra.security.DefaultSslContextFactory + # Verify client certificates + require_client_auth: false + # Set trustore and truststore_password if require_client_auth is true + # truststore: conf/.truststore + # truststore_password: cassandra + # More advanced defaults: + # protocol: TLS + # store_type: JKS + # cipher_suites: [ + # TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384, TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256, + # TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA, + # TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA, TLS_RSA_WITH_AES_128_GCM_SHA256, TLS_RSA_WITH_AES_128_CBC_SHA, + # TLS_RSA_WITH_AES_256_CBC_SHA + # ] + +# internode_compression controls whether traffic between nodes is +# compressed. +# Can be: +# +# all +# all traffic is compressed +# +# dc +# traffic between different datacenters is compressed +# +# none +# nothing is compressed. +internode_compression: dc + +# Enable or disable tcp_nodelay for inter-dc communication. +# Disabling it will result in larger (but fewer) network packets being sent, +# reducing overhead from the TCP protocol itself, at the cost of increasing +# latency if you block for cross-datacenter responses. 
+inter_dc_tcp_nodelay: false + +# TTL for different trace types used during logging of the repair process. +# Min unit: s +trace_type_query_ttl: 1d +# Min unit: s +trace_type_repair_ttl: 7d + +# If unset, all GC Pauses greater than gc_log_threshold will log at +# INFO level +# UDFs (user defined functions) are disabled by default. +# As of Cassandra 3.0 there is a sandbox in place that should prevent execution of evil code. +user_defined_functions_enabled: false + +# Enables scripted UDFs (JavaScript UDFs). +# Java UDFs are always enabled, if user_defined_functions_enabled is true. +# Enable this option to be able to use UDFs with "language javascript" or any custom JSR-223 provider. +# This option has no effect, if user_defined_functions_enabled is false. +scripted_user_defined_functions_enabled: false + +# Enables encrypting data at-rest (on disk). Different key providers can be plugged in, but the default reads from +# a JCE-style keystore. A single keystore can hold multiple keys, but the one referenced by +# the "key_alias" is the only key that will be used for encrypt operations; previously used keys +# can still (and should!) be in the keystore and will be used on decrypt operations +# (to handle the case of key rotation). +# +# It is strongly recommended to download and install Java Cryptography Extension (JCE) +# Unlimited Strength Jurisdiction Policy Files for your version of the JDK.
+# (current link: http://www.oracle.com/technetwork/java/javase/downloads/jce8-download-2133166.html) +# +# Currently, only the following file types are supported for transparent data encryption, although +# more are coming in future cassandra releases: commitlog, hints +transparent_data_encryption_options: + enabled: false + chunk_length_kb: 64 + cipher: AES/CBC/PKCS5Padding + key_alias: testing:1 + # CBC IV length for AES needs to be 16 bytes (which is also the default size) + # iv_length: 16 + key_provider: + - class_name: org.apache.cassandra.security.JKSKeyProvider + parameters: + - keystore: conf/.keystore + keystore_password: cassandra + store_type: JCEKS + key_password: cassandra + + +##################### +# SAFETY THRESHOLDS # +##################### + +# When executing a scan, within or across a partition, we need to keep the +# tombstones seen in memory so we can return them to the coordinator, which +# will use them to make sure other replicas also know about the deleted rows. +# With workloads that generate a lot of tombstones, this can cause performance +# problems and even exhaust the server heap. +# (http://www.datastax.com/dev/blog/cassandra-anti-patterns-queues-and-queue-like-datasets) +# Adjust the thresholds here if you understand the dangers and want to +# scan more tombstones anyway. These thresholds may also be adjusted at runtime +# using the StorageService mbean. +tombstone_warn_threshold: 1000 +tombstone_failure_threshold: 100000 + +# Filtering and secondary index queries at read consistency levels above ONE/LOCAL_ONE use a +# mechanism called replica filtering protection to ensure that results from stale replicas do +# not violate consistency. (See CASSANDRA-8272 and CASSANDRA-15907 for more details.) This +# mechanism materializes replica results by partition on-heap at the coordinator. The more possibly +# stale results returned by the replicas, the more rows materialized during the query.
+replica_filtering_protection: + # These thresholds exist to limit the damage severely out-of-date replicas can cause during these + # queries. They limit the number of rows from all replicas individual index and filtering queries + # can materialize on-heap to return correct results at the desired read consistency level. + # + # "cached_replica_rows_warn_threshold" is the per-query threshold at which a warning will be logged. + # "cached_replica_rows_fail_threshold" is the per-query threshold at which the query will fail. + # + # These thresholds may also be adjusted at runtime using the StorageService mbean. + # + # If the failure threshold is breached, it is likely that either the current page/fetch size + # is too large or one or more replicas is severely out-of-sync and in need of repair. + cached_rows_warn_threshold: 2000 + cached_rows_fail_threshold: 32000 + +# Log WARN on any multiple-partition batch size exceeding this value. 5KiB per batch by default. +# Caution should be taken on increasing the size of this threshold as it can lead to node instability. +# Min unit: KiB +batch_size_warn_threshold: 5KiB + +# Fail any multiple-partition batch exceeding this value. 50KiB (10x warn threshold) by default. 
+# Min unit: KiB +batch_size_fail_threshold: 50KiB + +# Log WARN on any batches not of type LOGGED that span across more partitions than this limit +unlogged_batch_across_partitions_warn_threshold: 10 + +# Log a warning when compacting partitions larger than this value +compaction_large_partition_warning_threshold: 100MiB + +# Log a warning when writing more tombstones than this value to a partition +compaction_tombstone_warning_threshold: 100000 + +# GC Pauses greater than 200 ms will be logged at INFO level +# This threshold can be adjusted to minimize logging if necessary +# Min unit: ms +# gc_log_threshold: 200ms + +# GC Pauses greater than gc_warn_threshold will be logged at WARN level +# Adjust the threshold based on your application throughput requirement. Setting to 0 +# will deactivate the feature. +# Min unit: ms +# gc_warn_threshold: 1000ms + +# Maximum size of any value in SSTables. Safety measure to detect SSTable corruption +# early. Any value size larger than this threshold will result in marking an SSTable +# as corrupted. This should be positive and less than 2GiB. +# Min unit: MiB +# max_value_size: 256MiB + +# ** Impact on keyspace creation ** +# If replication factor is not mentioned as part of keyspace creation, default_keyspace_rf would apply. +# Changing this configuration would only take effect for keyspaces created after the change, but does not impact +# existing keyspaces created prior to the change. +# ** Impact on keyspace alter ** +# When altering a keyspace from NetworkTopologyStrategy to SimpleStrategy, default_keyspace_rf is applied if rf is not +# explicitly mentioned. +# ** Impact on system keyspaces ** +# This would also apply for any system keyspaces that need replication factor. +# A further note about system keyspaces - system_traces and system_distributed keyspaces take RF of 2 or default, +# whichever is higher, and system_auth keyspace takes RF of 1 or default, whichever is higher.
+# Suggested value for use in production: 3 +# default_keyspace_rf: 1 + +# Track a metric per keyspace indicating whether replication achieved the ideal consistency +# level for writes without timing out. This is different from the consistency level requested by +# each write which may be lower in order to facilitate availability. +# ideal_consistency_level: EACH_QUORUM + +# Automatically upgrade sstables after upgrade - if there is no ordinary compaction to do, the +# oldest non-upgraded sstable will get upgraded to the latest version +# automatic_sstable_upgrade: false +# Limit the number of concurrent sstable upgrades +# max_concurrent_automatic_sstable_upgrades: 1 + +# Audit logging - Logs every incoming CQL command request, authentication to a node. See the docs +# on audit_logging for full details about the various configuration options. +audit_logging_options: + enabled: false + logger: + - class_name: BinAuditLogger + # audit_logs_dir: + # included_keyspaces: + # excluded_keyspaces: system, system_schema, system_virtual_schema + # included_categories: + # excluded_categories: + # included_users: + # excluded_users: + # roll_cycle: HOURLY + # block: true + # max_queue_weight: 268435456 # 256 MiB + # max_log_size: 17179869184 # 16 GiB + ## archive command is "/path/to/script.sh %path" where %path is replaced with the file being rolled: + # archive_command: + # max_archive_retries: 10 + + +# default options for full query logging - these can be overridden from command line when executing +# nodetool enablefullquerylog +# full_query_logging_options: + # log_dir: + # roll_cycle: HOURLY + # block: true + # max_queue_weight: 268435456 # 256 MiB + # max_log_size: 17179869184 # 16 GiB + ## archive command is "/path/to/script.sh %path" where %path is replaced with the file being rolled: + # archive_command: + ## note that enabling this allows anyone with JMX/nodetool access to run local shell commands as the user running cassandra + # allow_nodetool_archive_command: 
false + # max_archive_retries: 10 + +# validate tombstones on reads and compaction +# can be either "disabled", "warn" or "exception" +# corrupted_tombstone_strategy: disabled + +# Diagnostic Events # +# If enabled, diagnostic events can be helpful for troubleshooting operational issues. Emitted events contain details +# on internal state and temporal relationships across events, accessible by clients via JMX. +diagnostic_events_enabled: false + +# Use native transport TCP message coalescing. If on upgrade to 4.0 you found your throughput decreasing, and in +# particular you run an old kernel or have very few client connections, this option might be worth evaluating. +#native_transport_flush_in_batches_legacy: false + +# Enable tracking of repaired state of data during reads and comparison between replicas +# Mismatches between the repaired sets of replicas can be characterized as either confirmed +# or unconfirmed. In this context, unconfirmed indicates that the presence of pending repair +# sessions, unrepaired partition tombstones, or some other condition means that the disparity +# cannot be considered conclusive. Confirmed mismatches should be a trigger for investigation +# as they may be indicative of corruption or data loss. +# There are separate flags for range vs partition reads as single partition reads are only tracked +# when CL > 1 and a digest mismatch occurs. Currently, range queries don't use digests so if +# enabled for range reads, all range reads will include repaired data tracking. As this adds +# some overhead, operators may wish to disable it whilst still enabling it for partition reads +repaired_data_tracking_for_range_reads_enabled: false +repaired_data_tracking_for_partition_reads_enabled: false +# If false, only confirmed mismatches will be reported. If true, a separate metric for unconfirmed +# mismatches will also be recorded.
This is to avoid potential signal:noise issues, as unconfirmed +# mismatches are less actionable than confirmed ones. +report_unconfirmed_repaired_data_mismatches: false + +# Having many tables and/or keyspaces negatively affects performance of many operations in the +# cluster. When the number of tables/keyspaces in the cluster exceeds the following thresholds +# a client warning will be sent back to the user when creating a table or keyspace. +# As of cassandra 4.1, these properties are deprecated in favor of keyspaces_warn_threshold and tables_warn_threshold +# table_count_warn_threshold: 150 +# keyspace_count_warn_threshold: 40 + +# configure the read and write consistency levels for modifications to auth tables +# auth_read_consistency_level: LOCAL_QUORUM +# auth_write_consistency_level: EACH_QUORUM + +# Delays on auth resolution can lead to a thundering herd problem on reconnects; this option will enable +# warming of auth caches prior to node completing startup. See CASSANDRA-16958 +# auth_cache_warming_enabled: false + +######################### +# EXPERIMENTAL FEATURES # +######################### + +# Enables materialized view creation on this node. +# Materialized views are considered experimental and are not recommended for production use. +materialized_views_enabled: false + +# Enables SASI index creation on this node. +# SASI indexes are considered experimental and are not recommended for production use. +sasi_indexes_enabled: false + +# Enables creation of transiently replicated keyspaces on this node. +# Transient replication is experimental and is not recommended for production use. +transient_replication_enabled: false + +# Enables the use of 'ALTER ... DROP COMPACT STORAGE' statements on this node. +# 'ALTER ... DROP COMPACT STORAGE' is considered experimental and is not recommended for production use. +drop_compact_storage_enabled: false + +# Whether or not USE is allowed. This is enabled by default to avoid failure on upgrade.
+#use_statements_enabled: true + +# When the client triggers a protocol exception or unknown issue (Cassandra bug) we increment +# a client metric showing this; this logic will exclude specific subnets from updating these +# metrics +#client_error_reporting_exclusions: +# subnets: +# - 127.0.0.1 +# - 127.0.0.0/31 + +# Enables read thresholds (warn/fail) across all replicas for reporting back to the client. +# See: CASSANDRA-16850 +# read_thresholds_enabled: false # scheduled to be set true in 4.2 +# When read_thresholds_enabled: true, this tracks the materialized size of a query on the +# coordinator. If coordinator_read_size_warn_threshold is defined, this will emit a warning +# to clients with details on what query triggered this as well as the size of the result set; if +# coordinator_read_size_fail_threshold is defined, this will fail the query after it +# has exceeded this threshold, returning a read error to the user. +# coordinator_read_size_warn_threshold: +# coordinator_read_size_fail_threshold: +# When read_thresholds_enabled: true, this tracks the size of the local read (as defined by +# heap size), and will warn/fail based off these thresholds; undefined disables these checks. +# local_read_size_warn_threshold: +# local_read_size_fail_threshold: +# When read_thresholds_enabled: true, this tracks the expected memory size of the RowIndexEntry +# and will warn/fail based off these thresholds; undefined disables these checks +# row_index_read_size_warn_threshold: +# row_index_read_size_fail_threshold: + +# Guardrail to warn or fail when creating more user keyspaces than threshold. +# The two thresholds default to -1 to disable. +# keyspaces_warn_threshold: -1 +# keyspaces_fail_threshold: -1 +# Guardrail to warn or fail when creating more user tables than threshold. +# The two thresholds default to -1 to disable. 
+# tables_warn_threshold: -1 +# tables_fail_threshold: -1 +# Guardrail to enable or disable the ability to create uncompressed tables +# uncompressed_tables_enabled: true +# Guardrail to warn or fail when creating/altering a table with more columns per table than threshold. +# The two thresholds default to -1 to disable. +# columns_per_table_warn_threshold: -1 +# columns_per_table_fail_threshold: -1 +# Guardrail to warn or fail when creating more secondary indexes per table than threshold. +# The two thresholds default to -1 to disable. +# secondary_indexes_per_table_warn_threshold: -1 +# secondary_indexes_per_table_fail_threshold: -1 +# Guardrail to enable or disable the creation of secondary indexes +# secondary_indexes_enabled: true +# Guardrail to warn or fail when creating more materialized views per table than threshold. +# The two thresholds default to -1 to disable. +# materialized_views_per_table_warn_threshold: -1 +# materialized_views_per_table_fail_threshold: -1 +# Guardrail to warn about, ignore or reject properties when creating tables. By default all properties are allowed. +# table_properties_warned: [] +# table_properties_ignored: [] +# table_properties_disallowed: [] +# Guardrail to allow/disallow user-provided timestamps. Defaults to true. +# user_timestamps_enabled: true +# Guardrail to allow/disallow GROUP BY functionality. +# group_by_enabled: true +# Guardrail to allow/disallow TRUNCATE and DROP TABLE statements +# drop_truncate_table_enabled: true +# Guardrail to warn or fail when using a page size greater than threshold. +# The two thresholds default to -1 to disable. +# page_size_warn_threshold: -1 +# page_size_fail_threshold: -1 +# Guardrail to allow/disallow list operations that require read before write, i.e. setting list element by index and +# removing list elements by either index or value. Defaults to true. 
+# read_before_write_list_operations_enabled: true +# Guardrail to warn or fail when querying with an IN restriction selecting more partition keys than threshold. +# The two thresholds default to -1 to disable. +# partition_keys_in_select_warn_threshold: -1 +# partition_keys_in_select_fail_threshold: -1 +# Guardrail to warn or fail when an IN query creates a cartesian product with a size exceeding threshold, +# e.g. "a in (1,2,...10) and b in (1,2...10)" results in cartesian product of 100. +# The two thresholds default to -1 to disable. +# in_select_cartesian_product_warn_threshold: -1 +# in_select_cartesian_product_fail_threshold: -1 +# Guardrail to warn about or reject read consistency levels. By default, all consistency levels are allowed. +# read_consistency_levels_warned: [] +# read_consistency_levels_disallowed: [] +# Guardrail to warn about or reject write consistency levels. By default, all consistency levels are allowed. +# write_consistency_levels_warned: [] +# write_consistency_levels_disallowed: [] +# Guardrail to warn or fail when encountering larger size of collection data than threshold. +# At query time this guardrail is applied only to the collection fragment that is being written, even though in the case +# of non-frozen collections there could be unaccounted parts of the collection on the sstables. This is done this way to +# prevent read-before-write. The guardrail is also checked at sstable write time to detect large non-frozen collections, +# although in that case exceeding the fail threshold will only log an error message, without interrupting the operation. +# The two thresholds default to null to disable. +# Min unit: B +# collection_size_warn_threshold: +# Min unit: B +# collection_size_fail_threshold: +# Guardrail to warn or fail when encountering more elements in collection than threshold. 
+# At query time this guardrail is applied only to the collection fragment that is being written, even though in the case +# of non-frozen collections there could be unaccounted parts of the collection on the sstables. This is done this way to +# prevent read-before-write. The guardrail is also checked at sstable write time to detect large non-frozen collections, +# although in that case exceeding the fail threshold will only log an error message, without interrupting the operation. +# The two thresholds default to -1 to disable. +# items_per_collection_warn_threshold: -1 +# items_per_collection_fail_threshold: -1 +# Guardrail to allow/disallow querying with ALLOW FILTERING. Defaults to true. +# allow_filtering_enabled: true +# Guardrail to warn or fail when creating a user-defined-type with more fields than threshold. +# Default -1 to disable. +# fields_per_udt_warn_threshold: -1 +# fields_per_udt_fail_threshold: -1 +# Guardrail to warn or fail when local data disk usage percentage exceeds threshold. Valid values are in [1, 100]. +# This is only used for the disks storing data directories, so it won't count any separate disks used for storing +# the commitlog, hints nor saved caches. The disk usage is the ratio between the amount of space used by the data +# directories and the addition of that same space and the remaining free space on disk. The main purpose of this +# guardrail is rejecting user writes when the disks are over the defined usage percentage, so the writes done by +# background processes such as compaction and streaming don't fail due to a full disk. The limits should be defined +# accordingly to the expected data growth due to those background processes, so for example a compaction strategy +# doubling the size of the data would require to keep the disk usage under 50%. +# The two thresholds default to -1 to disable. 
+# data_disk_usage_percentage_warn_threshold: -1 +# data_disk_usage_percentage_fail_threshold: -1 +# Allows defining the max disk size of the data directories when calculating thresholds for +# data_disk_usage_percentage_warn_threshold and data_disk_usage_percentage_fail_threshold, so if this is greater than zero they +# become percentages of a fixed size on disk instead of percentages of the physically available disk size. This should +# be useful when we have a large disk and we only want to use a part of it for Cassandra's data directories. +# Valid values are in [1, max available disk size of all data directories]. +# Defaults to null to disable and use the physically available disk size of data directories during calculations. +# Min unit: B +# data_disk_usage_max_disk_size: +# Guardrail to warn or fail when the minimum replication factor is lower than threshold. +# This would also apply to system keyspaces. +# Suggested value for use in production: 2 or higher +# minimum_replication_factor_warn_threshold: -1 +# minimum_replication_factor_fail_threshold: -1 + +# Startup checks are executed as part of the Cassandra startup process. Not all of them +# are configurable (so that you can disable them), only those enumerated below. +# Uncomment the startup checks and configure them appropriately to cover your needs. +# +#startup_checks: +# Verifies correct ownership of attached locations on disk at startup. See CASSANDRA-16879 for more details. +# check_filesystem_ownership: +# enabled: false +# ownership_token: "sometoken" # (overridden by "CassandraOwnershipToken" system property) +# ownership_filename: ".cassandra_fs_ownership" # (overridden by "cassandra.fs_ownership_filename") +# Prevents a node from starting if snitch's data center differs from previous data center. +# check_dc: +# enabled: true # (overridden by cassandra.ignore_dc system property) +# Prevents a node from starting if snitch's rack differs from previous rack. 
+# check_rack: +# enabled: true # (overridden by cassandra.ignore_rack system property) +# Enable this property to fail startup if the node is down for longer than gc_grace_seconds, to potentially +# prevent data resurrection on tables with deletes. By default, this will run against all keyspaces and tables +# except the ones specified on excluded_keyspaces and excluded_tables. +# check_data_resurrection: +# enabled: false +# file where Cassandra periodically writes the last time it was known to run +# heartbeat_file: /var/lib/cassandra/data/cassandra-heartbeat +# excluded_keyspaces: # comma separated list of keyspaces to exclude from the check +# excluded_tables: # comma separated list of keyspace.table pairs to exclude from the check diff --git a/.doubanpde/scripts/cassandra/init_kv.cql b/.doubanpde/scripts/cassandra/init_kv.cql new file mode 100644 index 0000000..2f48873 --- /dev/null +++ b/.doubanpde/scripts/cassandra/init_kv.cql @@ -0,0 +1,29 @@ +-- DEFAULT USER: cassandra +-- default pass: cassandra +-- change cassandra password: +-- ALTER USER cassandra WITH PASSWORD 'verysecretpass'; + +-- create a user for doubandb +CREATE USER IF NOT EXISTS doubandb_test WITH PASSWORD 'doubandb_test'; +CREATE ROLE IF NOT EXISTS doubandb_admin; +-- Create a keyspace +CREATE KEYSPACE IF NOT EXISTS doubandb WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : '3' }; +GRANT ALL ON KEYSPACE doubandb TO doubandb_admin; +GRANT doubandb_admin TO doubandb_test; + +CREATE TYPE IF NOT EXISTS doubandb.bdbvalue ( + rtime timestamp, + flag int, + exptime int, + cas int, + body blob +); +-- Create a table +CREATE TABLE IF NOT EXISTS doubandb.kvstore ( + key blob PRIMARY KEY, + value doubandb.bdbvalue, +) WITH compression = {'class': 'ZstdCompressor'}; + +-- insert a @ value for test +--INSERT INTO doubandb.kvstore (key, value) +-- VALUES ('@', {rtime: '2023-06-21 08:01:14.247000+0000', flag: 0, exptime: 0, cas: 0, body: null}); diff --git a/.github/workflows/go.yml 
b/.github/workflows/go.yml index 40f3895..a7de427 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -4,9 +4,8 @@ jobs: test: strategy: matrix: - go-version: [1.12.x, 1.13.x] - python-version: [2.x] - platform: [ubuntu-latest, macos-latest] + go-version: [1.20.x, 1.21.x] + platform: [ubuntu-latest] runs-on: ${{ matrix.platform }} steps: - name: Install Go @@ -14,27 +13,19 @@ jobs: with: go-version: ${{ matrix.go-version }} - - name: Install Python - uses: actions/setup-python@v1 - with: - python-version: ${{ matrix.python-version }} - - name: Checkout code uses: actions/checkout@v1 + with: + fetch-depth: 1 + path: go/src/github.com/douban/gobeansproxy - - name: Get test tools - run: go get -u -v golang.org/x/tools/cmd/goimports - - - name: Prepare Test - run: pip install --user -r tests/pip-req.txt - - name: Test run: | - export PATH=${PATH}:`go env GOPATH`/bin - diff <(goimports -d .) <(printf "") - go mod vendor - go get -u -v github.com/douban/gobeansdb + go mod tidy + go install github.com/douban/gobeansdb@latest make test + env: + GOPATH: /home/runner/work/gobeansproxy/go/ - name: Install run: make install diff --git a/.gitignore b/.gitignore index 69a8060..6cf4871 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,10 @@ vendor/ .idea/ main vendor/ + +# add by pdectl +.doubanpde/* +!.doubanpde/pde.yaml +!.doubanpde/pdectl-* +.doubanpde/pdectl-*/* +!.doubanpde/pdectl-*/Dockerfile.tpl diff --git a/Makefile b/Makefile index 95771e0..53d26a0 100644 --- a/Makefile +++ b/Makefile @@ -2,18 +2,17 @@ all:install export PYTHONPATH=. 
+.PHONY: test test: - ./misc/gobeansdb_server.sh start go version go test github.com/douban/gobeansproxy/config go test github.com/douban/gobeansproxy/dstore - ./misc/gobeansdb_server.sh stop template: rm -r /var/lib/gobeansproxy/templates cp -r templates /var/lib/gobeansproxy/ -pytest:install +pytest: install ./tests/run_test.sh install: diff --git a/README.md b/README.md index 8fb4dd3..26bc06f 100644 --- a/README.md +++ b/README.md @@ -4,14 +4,14 @@ A proxy for [Gobeansdb](https://github.com/douban/gobeansdb). ## Prepare -Supported Go version: > 1.11.0 +Supported Go version: > 1.20.0 ## Install ``` $ git clone http://github.com/douban/gobeansproxy.git $ cd gobeansproxy -$ go mod vendor +$ go mod tidy $ make ``` diff --git a/cassandra/cstar.go b/cassandra/cstar.go new file mode 100644 index 0000000..a8825e7 --- /dev/null +++ b/cassandra/cstar.go @@ -0,0 +1,312 @@ +package cassandra + +import ( + "context" + "fmt" + "os" + "strings" + "sync" + "time" + "unicode" + + "github.com/douban/gobeansdb/loghub" + mc "github.com/douban/gobeansdb/memcache" + "github.com/douban/gobeansproxy/config" + "github.com/gocql/gocql" + "golang.org/x/sync/errgroup" +) + +const ( + MAX_KEY_LEN = 250 +) + +var ( + logger = loghub.ErrorLogger + proxyConf = &config.Proxy + selectQ string + insertQ string + deleteQ string +) + +type CassandraStore struct { + cluster *gocql.ClusterConfig + session *gocql.Session + keyTableFinder *KeyTableFinder + staticTable bool + ClusterName string +} + +func NewCassandraStore(cstarCfg *config.CassandraStoreCfg) (*CassandraStore, error) { + cluster := gocql.NewCluster(cstarCfg.Hosts...) 
+ if cstarCfg.Username != "" { + password := cstarCfg.Password + if cstarCfg.PasswordFile != "" { + data, err := os.ReadFile(cstarCfg.PasswordFile) + if err != nil { + return nil, err + } + password = strings.TrimSuffix(string(data), "\n") + } + + cluster.Authenticator = gocql.PasswordAuthenticator{ + Username: cstarCfg.Username, + Password: password, + } + } + cluster.Keyspace = cstarCfg.DefaultKeySpace + + switch cstarCfg.Consistency { + case "local_one": + cluster.Consistency = gocql.LocalOne + default: + cluster.Consistency = gocql.Quorum + } + + cluster.ReconnectInterval = time.Duration(cstarCfg.ReconnectIntervalSec) * time.Second + cluster.RetryPolicy = &gocql.SimpleRetryPolicy{NumRetries: cstarCfg.RetryNum} + cluster.Timeout = time.Duration(cstarCfg.CstarTimeoutMs) * time.Millisecond + cluster.ConnectTimeout = time.Duration(cstarCfg.CstarConnectTimeoutMs) * time.Millisecond + cluster.WriteTimeout = time.Duration(cstarCfg.CstarWriteTimeoutMs) * time.Millisecond + cluster.NumConns = cstarCfg.NumConns + + // cluster.SocketKeepalive = 600 * time.Second + session, err := cluster.CreateSession() + selectQ = fmt.Sprintf( + "select value from %s.%s where key = ?", + cstarCfg.DefaultKeySpace, cstarCfg.DefaultTable, + ) + insertQ = fmt.Sprintf( + "insert into %s.%s (key, value) values (?, ?)", + cstarCfg.DefaultKeySpace, cstarCfg.DefaultTable, + ) + deleteQ = fmt.Sprintf( + "delete from %s.%s where key = ?", + cstarCfg.DefaultKeySpace, cstarCfg.DefaultTable, + ) + + if err != nil { + return nil, err + } else { + cqlStore := &CassandraStore{ + cluster: cluster, + session: session, + } + + cqlStore.ClusterName, err = cqlStore.GetClusterName() + if err != nil { + return nil, err + } + ktFinder, err := NewKeyTableFinder(cstarCfg, cqlStore) + if err != nil { + return nil, err + } + cqlStore.keyTableFinder = ktFinder + cqlStore.staticTable = !cstarCfg.PrefixTableDispatcherCfg.Enable + return cqlStore, nil + } +} + +func (c *CassandraStore) Close() { + c.session.Close() +} + 
+func (c *CassandraStore) GetClusterName() (string, error) { + var name string + err := c.session.Query("select cluster_name from system.local").Scan(&name) + if err != nil { + return "", err + } else { + return name, nil + } +} + +func (c *CassandraStore) Get(key string) (*mc.Item, error) { + var q string + if c.staticTable { + q = selectQ + } else { + q = c.keyTableFinder.GetSqlTpl("select", key) + } + + value := &BDBValue{} + query := c.session.Query(q, key) + defer query.Release() + err := query.Scan(&value) + if err == gocql.ErrNotFound { + // https://github.com/douban/gobeansdb/blob/master/memcache/protocol.go#L499 + // just return nil for not found + return nil, nil + } + + if err != nil { + return nil, err + } else { + item, err := value.ToMCItem() + if err != nil { + return nil, err + } + return item, nil + } +} + +func (c *CassandraStore) GetMulti(keys []string, result map[string]*mc.Item) error { + // not using IN for this reason + // https://stackoverflow.com/questions/26999098/is-the-in-relation-in-cassandra-bad-for-queries + + lock := sync.Mutex{} + + ctx := context.Background() + g, ctx := errgroup.WithContext(ctx) + g.SetLimit(proxyConf.CassandraStoreCfg.MaxConnForGetm) + + for _, key := range keys { + key := key // https://golang.org/doc/faq#closures_and_goroutines + g.Go(func() error { + item, err := c.Get(key) + if item != nil { + lock.Lock() + defer lock.Unlock() + result[key] = item + } else { + if err != nil { + return err + } + // if item is nil, must be not found, we don't care + return nil + } + return nil + }) + } + + if err := g.Wait(); err != nil { + logger.Errorf("getm %s err: %s", keys, err) + } + + return nil +} + +func (c *CassandraStore) SetWithValue(key string, v *BDBValue) (ok bool, err error) { + var q string + + if c.staticTable { + q = insertQ + } else { + q = c.keyTableFinder.GetSqlTpl("insert", key) + } + + query := c.session.Query( + q, + key, + v, + ) + defer query.Release() + err = query.Exec() + + if err != nil { + 
logger.Debugf("Set key %s err: %s", key, err) + return false, err + } + + return true, nil +} + +func (c *CassandraStore) Set(key string, item *mc.Item) (ok bool, err error) { + var q string + + if c.staticTable { + q = insertQ + } else { + q = c.keyTableFinder.GetSqlTpl("insert", key) + } + + v := NewBDBValue(item) + query := c.session.Query( + q, + key, + v, + ) + defer query.Release() + err = query.Exec() + + if err != nil { + logger.Debugf("Set key %s err: %s", key, err) + return false, err + } + return true, nil +} + +func (c *CassandraStore) Delete(key string) (bool, error) { + var q string + + if c.staticTable { + q = deleteQ + } else { + q = c.keyTableFinder.GetSqlTpl("delete", key) + } + + query := c.session.Query( + q, + key, + ) + defer query.Release() + err := query.Exec() + + return err == nil, err +} + +func (c *CassandraStore) GetMeta(key string, extended bool) (*mc.Item, error) { + item, err := c.Get(key) + if err != nil { + return nil, err + } + + if item == nil { + return nil, err + } + + // we fake beansdb metadata + // in douban-beansdb those data used to check if records exists + var body string + if extended { + body = fmt.Sprintf( + "%d %d %d %d %d %d %d", + 1, 0, item.Flag, len(item.Body), item.ReceiveTime.Unix(), 0, 0, + ) + } else { + body = fmt.Sprintf( + "%d %d %d %d %d", + 1, 0, item.Flag, len(item.Body), item.ReceiveTime.Unix(), + ) + } + defer item.CArray.Free() + + result := new(mc.Item) + result.Body = []byte(body) + result.Flag = 0 + return result, nil +} + +func (c *CassandraStore) GetPrefixTableFinder() *KeyTableFinder { + return c.keyTableFinder +} + +func IsValidKeyString(key string) bool { + length := len(key) + if length == 0 || length > MAX_KEY_LEN { + logger.Warnf("bad key len=%d", length) + return false + } + + if key[0] <= ' ' || key[0] == '?' 
|| key[0] == '@' { + logger.Warnf("bad key len=%d key[0]=%x", length, key[0]) + return false + } + + for _, r := range key { + if unicode.IsControl(r) || unicode.IsSpace(r) { + logger.Warnf("bad key len=%d %s", length, key) + return false + } + } + return true +} diff --git a/cassandra/logger.go b/cassandra/logger.go new file mode 100644 index 0000000..89c3695 --- /dev/null +++ b/cassandra/logger.go @@ -0,0 +1,69 @@ +package cassandra + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/douban/gobeansproxy/config" + logrus "github.com/sirupsen/logrus" + rotateLogger "gopkg.in/natefinch/lumberjack.v2" +) + +var ( + log = logrus.New() + dumpLogger = logrus.New() +) + +func setLogLevel(logLevel string) { + l, err := logrus.ParseLevel(logLevel) + if err != nil { + log.Warnf("log level no supported will use info level (passed %s)", logLevel) + } + log.SetLevel(l) + log.SetFormatter(&logrus.TextFormatter{ + DisableColors: false, + FullTimestamp: true, + }) +} + +type DualWriteErrorMgr struct { + EFile string + ELogger *logrus.Logger +} + +func NewDualWErrMgr(ecfg *config.DualWErrCfg, logger *logrus.Logger) (*DualWriteErrorMgr, error) { + if logger == nil { + logger = dumpLogger + } + + setLogLevel(ecfg.LoggerLevel) + + // check if target folder exists + if stat, err := os.Stat(ecfg.DumpToDir); err != nil || !stat.IsDir() { + return nil, fmt.Errorf("%s is not a dir or not exists", ecfg.DumpToDir) + } + + // set dump Logger + logger.SetFormatter(&logrus.JSONFormatter{}) + dumpFile := filepath.Join(ecfg.DumpToDir, ecfg.FName) + logger.SetOutput(&rotateLogger.Logger{ + Filename: dumpFile, + MaxSize: ecfg.RotateSize, + Compress: ecfg.Compress, + MaxAge: ecfg.MaxAges, + MaxBackups: ecfg.MaxBackups, + }) + + return &DualWriteErrorMgr{ + EFile: dumpFile, + ELogger: logger, + }, nil +} + +func (e *DualWriteErrorMgr) HandleErr(key, op string, err error) { + e.ELogger.WithFields(logrus.Fields{ + "key": key, + "op": op, + }).Error(err) +} diff --git a/cassandra/prefix_cfg.go 
b/cassandra/prefix_cfg.go new file mode 100644 index 0000000..198c662 --- /dev/null +++ b/cassandra/prefix_cfg.go @@ -0,0 +1,81 @@ +package cassandra + +import ( + "fmt" + + "github.com/douban/gobeansproxy/config" +) + +type PrefixDisPatcher interface { + LoadStaticCfg(string) (*config.CassandraStoreCfg, error) + LoadCfg(*config.CassandraStoreCfg, *CassandraStore) error + Upsert(*config.CassandraStoreCfg, map[string][]string, *CassandraStore) error + DeletePrefix(*config.CassandraStoreCfg, string, *CassandraStore) error + GetCurrentMap() map[string]string +} + +type DisPatcherCfg config.PrefixDisPatcherCfg + +func (config *DisPatcherCfg) LoadFromDB( + cqlStore *CassandraStore) (prefixKeys [][]rune, vstatus []string, err error) { + r := cqlStore.session.Query( + fmt.Sprintf( + "select prefix, value from %s.%s", + config.CfgFromCstarKeySpace, + config.CfgFromCstarTable, + ), + ).Iter().Scanner() + + for r.Next() { + var ( + prefix string + value string + ) + + err := r.Scan(&prefix, &value) + if err != nil { + return nil, nil, fmt.Errorf("load cfg from c* table err: %s", err) + } + + prefixKeys = append(prefixKeys, []rune(prefix)) + vstatus = append(vstatus, value) + } + + if err := r.Err(); err != nil { + return nil, nil, fmt.Errorf("load cfg from c* iter err: %s", err) + } + + return prefixKeys, vstatus, err +} + +func (c *DisPatcherCfg) SaveToDB(m map[string][]string, cqlStore *CassandraStore) error { + for value, prefix := range m { + for _, p := range prefix { + err := cqlStore.session.Query( + fmt.Sprintf( + "insert into %s.%s (prefix, value) values (?, ?)", + c.CfgFromCstarKeySpace, c.CfgFromCstarTable, + ), p, value, + ).Exec() + + if err != nil { + return fmt.Errorf("insert %s -> %s err: %s", p, value) + } + } + } + return nil +} + +func (c *DisPatcherCfg) DeletePrefixCfg(prefix string, cqlStore *CassandraStore) error { + err := cqlStore.session.Query( + fmt.Sprintf( + "delete from %s.%s where prefix = ?", + c.CfgFromCstarKeySpace, c.CfgFromCstarTable, + ), 
prefix, + ).Exec() + + if err != nil { + return fmt.Errorf("delete cfg prefix %s err: %s", prefix, err) + } + return nil +} diff --git a/cassandra/prefix_switch.go b/cassandra/prefix_switch.go new file mode 100644 index 0000000..2a6fac1 --- /dev/null +++ b/cassandra/prefix_switch.go @@ -0,0 +1,338 @@ +package cassandra + +import ( + "fmt" + "io/ioutil" + "path/filepath" + "sync" + + "github.com/acomagu/trie/v2" + "github.com/douban/gobeansproxy/config" + "gopkg.in/yaml.v3" +) + +type PrefixSwitchStatus int + +const ( + // bdb r/w c* disable + PrefixSwitchBrw PrefixSwitchStatus = 0 + // bdb r/w c* w + PrefixSwitchBrwCw PrefixSwitchStatus = 1 + // bdb w c* r/w + PrefixSwitchBwCrw PrefixSwitchStatus = 2 + // c* rw bdb disable + PrefixSwitchCrw PrefixSwitchStatus = 3 + // c* read only bdb disable + PrefixSwitchCr PrefixSwitchStatus = 4 + + statusBrw string = "br1w1cr0w0" + statusBrwCw string = "br1w1cr0w1" + statusBwCrw string = "br0w1cr1w1" + statusCrw string = "br0w0cr1w1" + statusCr string = "br0w0cr1w0" +) + +var ( + allowRWStatus = map[string]bool{ + statusBrw: true, + statusBrwCw: true, + statusBwCrw: true, + statusCrw: true, + statusCr: true, + } +) + +type PrefixSwitcher struct { + trie *trie.Tree[rune, PrefixSwitchStatus] + defaultT PrefixSwitchStatus + lock sync.RWMutex + currentTrieMap map[string]string + cstarEnabled bool +} + +func (s PrefixSwitchStatus) IsReadOnBeansdb() bool { + return s == PrefixSwitchBrw || s == PrefixSwitchBrwCw +} + +func (s PrefixSwitchStatus) IsReadOnCstar() bool { + return s == PrefixSwitchCrw || s == PrefixSwitchCr || s == PrefixSwitchBwCrw +} + +func (s PrefixSwitchStatus) IsWriteOnBeansdb() bool { + return s == PrefixSwitchBrw || s == PrefixSwitchBrwCw || s == PrefixSwitchBwCrw +} + +func (s PrefixSwitchStatus) IsWriteOnCstar() bool { + return s == PrefixSwitchCrw || s == PrefixSwitchBrwCw || s == PrefixSwitchBwCrw +} + +func strToSwitchStatus(s string) (PrefixSwitchStatus, error) { + switch s { + case statusBrw: + return 
PrefixSwitchBrw, nil + case statusBrwCw: + return PrefixSwitchBrwCw, nil + case statusBwCrw: + return PrefixSwitchBwCrw, nil + case statusCrw: + return PrefixSwitchCrw, nil + case statusCr: + return PrefixSwitchCr, nil + default: + return -1, fmt.Errorf("Unsupported switch type of %s", s) + } +} + +func GetPrefixSwitchTrieFromCfg( + cfg *config.CassandraStoreCfg, cqlStore *CassandraStore) ( + *trie.Tree[rune, PrefixSwitchStatus], map[string]string, error) { + if !cfg.PrefixRWDispatcherCfg.Enable { + logger.Infof("rw switcher disabled, skip init ...") + return nil, nil, nil + } + + s2k := cfg.PrefixRWDispatcherCfg.StaticCfg + + keysString := [][]rune{} + vStatus := []PrefixSwitchStatus{} + vStrStatus := []string{} + dedup := map[string]struct{}{} + + if cfg.PrefixRWDispatcherCfg.CfgFromCstarTable != "" && + cfg.PrefixRWDispatcherCfg.CfgFromCstarKeySpace != "" { + c := DisPatcherCfg(cfg.PrefixRWDispatcherCfg) + pkeys, pvalues, err := c.LoadFromDB(cqlStore) + if err != nil { + return nil, nil, err + } + + keysString = append(keysString, pkeys...) + vStrStatus = append(vStrStatus, pvalues...) 
+ } + + if len(s2k) > 0 { + for s, kprefixs := range s2k { + for _, prefix := range kprefixs { + keysString = append(keysString, []rune(prefix)) + vStrStatus = append(vStrStatus, s) + } + } + } + + // check duplicate + if len(vStrStatus) != len(keysString) { + return nil, nil, fmt.Errorf("value list len not match with prefix list len") + } + + duplicateKeys := []string{} + loadedMap := map[string]string{} + for idx, k := range keysString { + ks := string(k) + loadedMap[ks] = vStrStatus[idx] + if _, ok := dedup[ks]; !ok { + dedup[ks] = struct{}{} + } else { + duplicateKeys = append(duplicateKeys, ks) + } + } + if len(duplicateKeys) > 0 { + return nil, nil, fmt.Errorf("prefix cfg duplicate: %v", duplicateKeys) + } + + // now init real value + for _, sv := range vStrStatus { + rv, err := strToSwitchStatus(sv) + if err != nil { + return nil, nil, fmt.Errorf("parse value %s to status err: %s", sv, err) + } + vStatus = append(vStatus, rv) + } + + logger.Infof("Loading from cfg: %v", loadedMap) + if len(keysString) == len(vStatus) && len(keysString) > 0 { + tr := trie.New[rune, PrefixSwitchStatus](keysString, vStatus) + return &tr, loadedMap, nil + } else { + return nil, loadedMap, nil + } +} + +func NewPrefixSwitcher(config *config.CassandraStoreCfg, cqlStore *CassandraStore) (*PrefixSwitcher, error) { + f := new(PrefixSwitcher) + + if !config.Enable { + f.defaultT = PrefixSwitchBrw + f.cstarEnabled = false + return f, nil + } + + prefixTrie, nowMap, err := GetPrefixSwitchTrieFromCfg(config, cqlStore) + if err != nil { + return nil, err + } + + f.trie = prefixTrie + f.cstarEnabled = true + + defaultS, err := strToSwitchStatus(config.SwitchToKeyDefault) + if err != nil { + return nil, err + } + + f.defaultT = defaultS + f.currentTrieMap = nowMap + return f, nil +} + +// use this to match longest prefix of key +// You should lock the s trie to prevent trie update +func (s *PrefixSwitcher) matchStatus(key string) PrefixSwitchStatus { + if s.trie == nil { + return s.defaultT 
+ } + + var v PrefixSwitchStatus + var match bool + + n := *(s.trie) + + for _, c := range key { + if n = n.TraceOne(c); n == nil { + break + } + + if vv, ok := n.Terminal(); ok { + v = vv + match = true + } + } + + if match { + return v + } else { + return s.defaultT + } +} + +func (s *PrefixSwitcher) GetStatus(key string) PrefixSwitchStatus { + if !s.cstarEnabled { + return PrefixSwitchBrw + } + + s.lock.RLock() + defer s.lock.RUnlock() + return s.matchStatus(key) +} + +// check key prefix and return bdb read enable c* read enable +func (s *PrefixSwitcher) ReadEnabledOn(key string) (bool, bool) { + if !s.cstarEnabled { + return true, false + } + status := s.GetStatus(key) + return status.IsReadOnBeansdb(), status.IsReadOnCstar() +} + +// check keys prefix list and return bdb read keys and c* read keys +func (s *PrefixSwitcher) ReadEnableOnKeys(keys []string) (bkeys []string, ckeys []string) { + if !s.cstarEnabled { + bkeys = keys + return + } + + s.lock.RLock() + defer s.lock.RUnlock() + + for _, k := range keys { + status := s.matchStatus(k) + if status.IsReadOnBeansdb() { + bkeys = append(bkeys, k) + // prevent wrong status + // read can only be enable on 1 backend + continue + } + + if status.IsReadOnCstar() { + ckeys = append(ckeys, k) + } + } + return +} + +// check key prefix and return bdb write enable c* write enable +func (s *PrefixSwitcher) WriteEnabledOn(key string) (bool, bool) { + if !s.cstarEnabled { + return true, false + } + status := s.GetStatus(key) + return status.IsWriteOnBeansdb(), status.IsWriteOnCstar() +} + +func (s *PrefixSwitcher) LoadStaticCfg(cfgDir string) (*config.CassandraStoreCfg, error) { + cfg := struct { + CassandraCfg config.CassandraStoreCfg `yaml:"cassandra"` + }{} + + configF, err := ioutil.ReadFile(filepath.Join(cfgDir, "proxy.yaml")) + if err != nil { + return nil, err + } + err = yaml.Unmarshal(configF, &cfg) + if err != nil { + return nil, err + } + + return &cfg.CassandraCfg, nil +} + +func (s *PrefixSwitcher) 
LoadCfg(cfg *config.CassandraStoreCfg, cqlStore *CassandraStore) error { + if !cfg.Enable { + logger.Errorf("You can't use prefix switcher when c* backend disabled") + return fmt.Errorf("can't load prefix swicher cfg when cassandra backend disabled") + } + + if !cfg.PrefixRWDispatcherCfg.Enable { + logger.Errorf("You can't disable rw dispatcher online") + return fmt.Errorf("You can't disable rw dispathcer online") + } + + pTrie, nowMap, err := GetPrefixSwitchTrieFromCfg(cfg, cqlStore) + if err != nil { + logger.Errorf("reloading c* cfg err: %s", err) + return err + } + logger.Infof("reloading c* cfg for prefix switch to: %v", nowMap) + + defaultS, err := strToSwitchStatus(cfg.SwitchToKeyDefault) + if err != nil { + logger.Errorf("default switch storage parse err: %s", err) + } + logger.Infof("reloading c* cfg for prefix default store to: %s", cfg.SwitchToKeyDefault) + + + s.lock.Lock() + defer s.lock.Unlock() + s.trie = pTrie + s.defaultT = defaultS + s.currentTrieMap = nowMap + return nil +} + +func (s *PrefixSwitcher) Upsert(cfg *config.CassandraStoreCfg, data map[string][]string, cqlStore *CassandraStore) error { + for rwStatus := range data { + if _, ok := allowRWStatus[rwStatus]; !ok { + return fmt.Errorf("%s is not a validate rwstatus", rwStatus) + } + } + dispatcherCfg := DisPatcherCfg(cfg.PrefixRWDispatcherCfg) + return dispatcherCfg.SaveToDB(data, cqlStore) +} + +func (s *PrefixSwitcher) DeletePrefix(cfg *config.CassandraStoreCfg, prefix string, cqlStore *CassandraStore) error { + dispatcherCfg := DisPatcherCfg(cfg.PrefixRWDispatcherCfg) + return dispatcherCfg.DeletePrefixCfg(prefix, cqlStore) +} + +func (s *PrefixSwitcher) GetCurrentMap() map[string]string { + return s.currentTrieMap +} diff --git a/cassandra/prefix_table_finder.go b/cassandra/prefix_table_finder.go new file mode 100644 index 0000000..dfbb3b6 --- /dev/null +++ b/cassandra/prefix_table_finder.go @@ -0,0 +1,218 @@ +package cassandra + +import ( + "fmt" + "io/ioutil" + "path/filepath" + 
"sync" + + "github.com/acomagu/trie/v2" + "gopkg.in/yaml.v3" + + "github.com/douban/gobeansproxy/config" +) + +var ( + selectQTpl string + insertQTpl string + deleteQTpl string +) + +type KeyTableFinder struct { + trie *trie.Tree[rune, string] + defaultT string + lock sync.RWMutex + currentMap map[string]string +} + +func getTableTrieFromCfg( + ccfg *config.CassandraStoreCfg, cqlStore *CassandraStore) ( + *trie.Tree[rune, string], map[string]string, error) { + if !ccfg.PrefixTableDispatcherCfg.Enable { + logger.Infof("table switcher disabled, skip init ...") + return nil, nil, nil + } + + tcfg := ccfg.PrefixTableDispatcherCfg + s2k := tcfg.StaticCfg + + var prefixTrie *trie.Tree[rune, string] + + keysString := [][]rune{} + vStrStatus := []string{} + dedup := map[string]struct{}{} + + if tcfg.CfgFromCstarTable != "" && tcfg.CfgFromCstarKeySpace != "" { + c := DisPatcherCfg(tcfg) + pkeys, pvalues, err := c.LoadFromDB(cqlStore) + if err != nil { + return nil, nil, err + } + + keysString = append(keysString, pkeys...) + vStrStatus = append(vStrStatus, pvalues...) 
+ } + + if len(s2k) > 0 { + for s, kprefixs := range s2k { + for _, prefix := range kprefixs { + keysString = append(keysString, []rune(prefix)) + vStrStatus = append(vStrStatus, s) + } + } + } + + // check duplicate + if len(vStrStatus) != len(keysString) { + return nil, nil, fmt.Errorf("value list len not match with prefix list len") + } + + duplicateKeys := []string{} + loadedMap := map[string]string{} + for idx, k := range keysString { + ks := string(k) + loadedMap[ks] = vStrStatus[idx] + if _, ok := dedup[ks]; !ok { + dedup[ks] = struct{}{} + } else { + duplicateKeys = append(duplicateKeys, ks) + } + } + if len(duplicateKeys) > 0 { + return nil, nil, fmt.Errorf("prefix cfg duplicate: %v", duplicateKeys) + } + logger.Infof("Loading from cfg: %v", loadedMap) + + if len(keysString) == len(vStrStatus) && len(keysString) > 0 { + tr := trie.New[rune, string](keysString, vStrStatus) + prefixTrie = &tr + } else { + prefixTrie = nil + } + + return prefixTrie, loadedMap, nil +} + +func NewKeyTableFinder(config *config.CassandraStoreCfg, cqlStore *CassandraStore) (*KeyTableFinder, error) { + f := new(KeyTableFinder) + t, nowMap, err := getTableTrieFromCfg(config, cqlStore) + if err != nil { + return nil, err + } + f.trie = t + f.defaultT = config.DefaultTable + f.currentMap = nowMap + + // init sql str + selectQTpl = fmt.Sprintf( + "select value from %s.%%s where key = ?", + config.DefaultKeySpace, + ) + insertQTpl = fmt.Sprintf( + "insert into %s.%%s (key, value) values (?, ?)", + config.DefaultKeySpace, + ) + deleteQTpl = fmt.Sprintf( + "delete from %s.%%s where key = ?", + config.DefaultKeySpace, + ) + + return f, nil +} + +func (f *KeyTableFinder) GetTableByKey(key string) string { + if f.trie == nil { + return f.defaultT + } + + var v string + var match bool + + f.lock.RLock() + defer f.lock.RUnlock() + + n := *(f.trie) + + for _, c := range key { + if n = n.TraceOne(c); n == nil { + break + } + + if vv, ok := n.Terminal(); ok { + v = vv + match = true + } + } + + 
if match { + return v + } else { + return f.defaultT + } +} + +func (f *KeyTableFinder) GetSqlTpl(sqlType string, key string) string { + switch sqlType { + case "select": + return fmt.Sprintf(selectQTpl, f.GetTableByKey(key)) + case "delete": + return fmt.Sprintf(deleteQTpl, f.GetTableByKey(key)) + default: + return fmt.Sprintf(insertQTpl, f.GetTableByKey(key)) + } +} + +func (f *KeyTableFinder) LoadStaticCfg(cfgDir string) (*config.CassandraStoreCfg, error) { + cfg := struct { + CassandraCfg config.CassandraStoreCfg `yaml:"cassandra"` + }{} + + configF, err := ioutil.ReadFile(filepath.Join(cfgDir, "proxy.yaml")) + if err != nil { + return nil, err + } + err = yaml.Unmarshal(configF, &cfg) + if err != nil { + return nil, err + } + + return &cfg.CassandraCfg, nil +} + +func (f *KeyTableFinder) LoadCfg(cfg *config.CassandraStoreCfg, cqlStore *CassandraStore) error { + if !cfg.PrefixTableDispatcherCfg.Enable { + return fmt.Errorf("you can't disable key table finder online") + } + pTrie, nowMap, err := getTableTrieFromCfg(cfg, cqlStore) + if err != nil { + logger.Errorf("reloading c* cfg err: %s", err) + return err + } + logger.Infof("reloading c* cfg for prefix switch to: %v", nowMap) + + defaultS := cfg.DefaultTable + logger.Infof("reloading c* cfg for prefix default store to: %s", cfg.DefaultTable) + + + f.lock.Lock() + defer f.lock.Unlock() + f.trie = pTrie + f.defaultT = defaultS + f.currentMap = nowMap + cqlStore.staticTable = !cfg.PrefixTableDispatcherCfg.Enable + return nil +} + +func (f *KeyTableFinder) Upsert(cfg *config.CassandraStoreCfg, data map[string][]string, cqlStore *CassandraStore) error { + dispatcherCfg := DisPatcherCfg(cfg.PrefixTableDispatcherCfg) + return dispatcherCfg.SaveToDB(data, cqlStore) +} + +func (f *KeyTableFinder) DeletePrefix(cfg *config.CassandraStoreCfg, prefix string, cqlStore *CassandraStore) error { + dispatcherCfg := DisPatcherCfg(cfg.PrefixTableDispatcherCfg) + return dispatcherCfg.DeletePrefixCfg(prefix, cqlStore) +} + +func 
(f *KeyTableFinder) GetCurrentMap() map[string]string { + return f.currentMap +} diff --git a/cassandra/prefix_table_finder_test.go b/cassandra/prefix_table_finder_test.go new file mode 100644 index 0000000..dfb47ea --- /dev/null +++ b/cassandra/prefix_table_finder_test.go @@ -0,0 +1,64 @@ +package cassandra + +import ( + "fmt" + "testing" + + "github.com/douban/gobeansproxy/config" +) + +var ( + cstarCfgTest = &config.CassandraStoreCfg{ + TableToKeyPrefix: map[string][]string{ + "a": []string{ + "/a", + "/a/b/c", + "/d/e/ffff", + "/d/f/eeee", + }, + + "and": []string{ + "/and/anding", + "/a/kkkk", + }, + }, + DefaultTable: "misc", + } +) + +func TestKeyTableFinder(t *testing.T) { + tree, err := NewKeyTableFinder(cstarCfgTest) + if err != nil { + t.Fatalf("init keytable finder err %s", err) + } + + + testData := map[string]string{ + "/a/fff/": "a", + "/and/anding/kkk/fff": "and", + "/d/e/ffff/fkljwe": "a", + "iamnoting": "misc", + "/a/kkkk/defa": "and", + } + + for k, v := range testData { + if tree.GetTableByKey(k) != v { + t.Fatalf("%s table find err, should be: %s", k, v) + } + } +} + +func BenchmarkKeyTableFinder(b *testing.B) { + f, err := NewKeyTableFinder(cstarCfgTest) + if err != nil { + b.Failed() + } + + for n := 0; n < b.N; n++ { + k := fmt.Sprintf("send_me_toMisc_%d", n) + m := f.GetTableByKey(k) + if m != "misc" { + panic(fmt.Sprintf("expect misc but got: %s, key: %s", m, k)) + } + } +} diff --git a/cassandra/udt.go b/cassandra/udt.go new file mode 100644 index 0000000..2e8974e --- /dev/null +++ b/cassandra/udt.go @@ -0,0 +1,78 @@ +package cassandra + +import ( + "fmt" + "time" + + mc "github.com/douban/gobeansdb/memcache" + "github.com/gocql/gocql" +) + +type BDBValue struct { + ReceiveTime time.Time `cql:"rtime"` + Flag int `cql:"flag"` + Exptime int `cql:"exptime"` + Cas int `cql:"cas"` + Body []byte `cql:"body"` +} + +func NewBDBValue(item *mc.Item) *BDBValue { + return &BDBValue{ + ReceiveTime: item.ReceiveTime, + Flag: item.Flag, + Exptime: 
item.Exptime, + Cas: item.Cas, + Body: item.CArray.Body, + } +} + +func (b *BDBValue) ToMCItem() (*mc.Item, error) { + item := &mc.Item{ + ReceiveTime: b.ReceiveTime, + Flag: b.Flag, + Exptime: b.Exptime, + Cas: b.Cas, + } + ok := item.Alloc(len(b.Body)) + if !ok { + logger.Errorf("Alloc mem err for len %d", len(b.Body)) + return nil, fmt.Errorf("alloc mem error") + } + copy(item.CArray.Body, b.Body) + return item, nil +} + +func (b BDBValue) MarshalUDT(name string, info gocql.TypeInfo) ([]byte, error) { + switch name { + case "rtime": + return gocql.Marshal(info, b.ReceiveTime) + case "flag": + return gocql.Marshal(info, b.Flag) + case "exptime": + return gocql.Marshal(info, b.Exptime) + case "cas": + return gocql.Marshal(info, b.Cas) + case "body": + return gocql.Marshal(info, b.Body) + default: + return nil, fmt.Errorf("unknown column for position: %q", name) + } +} + +func (b *BDBValue) UnmarshalUDT(name string, info gocql.TypeInfo, data []byte) error { + switch name { + case "rtime": + return gocql.Unmarshal(info, data, &b.ReceiveTime) + case "flag": + return gocql.Unmarshal(info, data, &b.Flag) + case "exptime": + return gocql.Unmarshal(info, data, &b.Exptime) + case "cas": + return gocql.Unmarshal(info, data, &b.Cas) + case "body": + return gocql.Unmarshal(info, data, &b.Body) + default: + return fmt.Errorf("unknown column for position: %q", name) + } + +} diff --git a/conf/proxy.yaml b/conf/proxy.yaml index 0d5f57b..5b75ec6 100644 --- a/conf/proxy.yaml +++ b/conf/proxy.yaml @@ -31,3 +31,61 @@ dstore: score_deviation: 10000 item_size_stats: 4096 response_time_min: 4000 + enable: true +cassandra: + enable: true + default_key_space: dbname + default_table: tablename + hosts: + - cassandra:9042 + timeout_ms: 1000 + connect_timeout_ms: 3000 + write_timeout_ms: 1000 + retry_num: 3 + reconnect_interval_sec: 180 + max_conn_for_getm: 10 + num_conns: 10 + username: "" + # plaintext password only for test usage + # please use password_file in prod env + password: "" + 
password_file: "" + # local_one only for test usage + # default: quorum + # consistency: "local_one" + prefix_table_dispatcher_cfg: + # if not enable will use default keyspace and table + enable: false + static: + # dispatch prefix1 key to table table_name1 + table_name1: + - "prefix1" + cfg_table: cassandra_cfg_table_name + cfg_keyspace: cassandra_cfg_keyspace + prefix_rw_dispatcher_cfg: + enable: true + static: + # dispatch prefix /test_prefix_c/ to dual write + br1w1cr0w1: + - "/test_prefix_c/" + - "/test_prefix_d/" + br0w0cr1w1: + - "test_" + cfg_table: cassandra_cfg_table_name + cfg_keyspace: cassandra_cfg_keyspace + # if not match rw dispatcher config + # will fallback to this default storage rw cfg + # br1w1cr0w0: only use beansdb as backend + # br1w1cr0w1: dual write and read from beansdb + # br0w1cr1w1: dual write and read from c* + # br0w0cr1w1: only use c* for rw backend + default_storage: "br1w1cr0w0" + # dual write error log config + dual_write_err_cfg: + dump_to_dir: /var/gobeansproxy/log/ + log_file_name: dual_write_err.log + logger_level: "INFO" + rotate_size_mb: 100 + compress: true + max_ages: 7 + max_backups: 100 diff --git a/conf/table_dispatcher_cfg.cql b/conf/table_dispatcher_cfg.cql new file mode 100644 index 0000000..181ce0b --- /dev/null +++ b/conf/table_dispatcher_cfg.cql @@ -0,0 +1,4 @@ +CREATE TABLE IF NOT EXISTS YOURKEYSPACE.YOURTABLE ( + prefix blob PRIMARY KEY, + value string, +); diff --git a/config/config.go b/config/config.go index e56e7c1..c209431 100644 --- a/config/config.go +++ b/config/config.go @@ -9,7 +9,7 @@ import ( ) const ( - Version = "v1.0.2" + Version = "v2.1.0" ) var ( @@ -21,6 +21,8 @@ type ProxyConfig struct { dbcfg.ServerConfig `yaml:"proxy,omitempty"` dbcfg.MCConfig `yaml:"mc,omitempty"` DStoreConfig `yaml:"dstore,omitempty"` + CassandraStoreCfg `yaml:"cassandra,omitempty"` + Confdir string } type DStoreConfig struct { @@ -38,6 +40,48 @@ type DStoreConfig struct { ScoreDeviation float64 
`yaml:"score_deviation,omitempty"` ItemSizeStats int `yaml:"item_size_stats,omitempty"` ResponseTimeMin float64 `yaml:"response_time_min,omitempty"` + Enable bool `yaml:"enable"` + Scheduler string `yaml:"scheduler,omitempty"` +} + +type DualWErrCfg struct { + DumpToDir string `yaml:"dump_to_dir"` + FName string `yaml:"log_file_name"` + LoggerLevel string `yaml:"logger_level"` + RotateSize int `yaml:"rotate_size_mb"` + Compress bool `yaml:"compress"` + MaxAges int `yaml:"max_ages"` + MaxBackups int `yaml:"max_backups"` +} + +type PrefixDisPatcherCfg struct { + StaticCfg map[string][]string `yaml:"static"` + CfgFromCstarTable string `yaml:"cfg_table"` + CfgFromCstarKeySpace string `yaml:"cfg_keyspace"` + Enable bool `yaml:"enable"` +} + +type CassandraStoreCfg struct { + Enable bool `yaml:"enable"` + Hosts []string `yaml:"hosts"` + DefaultKeySpace string `yaml:"default_key_space"` + DefaultTable string `yaml:"default_table"` + CstarTimeoutMs int `yaml:"timeout_ms"` + CstarConnectTimeoutMs int `yaml:"connect_timeout_ms"` + CstarWriteTimeoutMs int `yaml:"write_timeout_ms"` + MaxConnForGetm int `yaml:"max_conn_for_getm"` + // ref: https://pkg.go.dev/github.com/gocql/gocql?utm_source=godoc#ClusterConfig + ReconnectIntervalSec int `yaml:"reconnect_interval_sec"` + RetryNum int `yaml:"retry_num"` + NumConns int `yaml:"num_conns"` + Username string `yaml:"username"` + Password string `yaml:"password"` + PasswordFile string `yaml:"password_file"` + Consistency string `yaml:"consistency,omitempty"` + PrefixTableDispatcherCfg PrefixDisPatcherCfg `yaml:"prefix_table_dispatcher_cfg"` + PrefixRWDispatcherCfg PrefixDisPatcherCfg `yaml:"prefix_rw_dispatcher_cfg"` + SwitchToKeyDefault string `yaml:"default_storage"` + DualWErrCfg DualWErrCfg `yaml:"dual_write_err_cfg"` } func (c *ProxyConfig) InitDefault() { @@ -63,26 +107,29 @@ func (c *ProxyConfig) Load(confdir string) { } // route - routePath := path.Join(confdir, "route.yaml") - var route *dbcfg.RouteTable - - if 
len(c.ZKServers) > 0 { - route, err = dbcfg.LoadRouteTableZK(routePath, c.ZKPath, c.ZKServers) - if err != nil { - log.Printf("fail to load route table from zk: %s, err: %s", c.ZKPath, err.Error()) - } - } - - if len(c.ZKServers) == 0 || err != nil { - route, err = dbcfg.LoadRouteTableLocal(routePath) + if c.DStoreConfig.Enable { + routePath := path.Join(confdir, "route.yaml") + var route *dbcfg.RouteTable + + if len(c.ZKServers) > 0 { + route, err = dbcfg.LoadRouteTableZK(routePath, c.ZKPath, c.ZKServers) + if err != nil { + log.Printf("fail to load route table from zk: %s, err: %s", c.ZKPath, err.Error()) + } + } + + if len(c.ZKServers) == 0 || err != nil { + route, err = dbcfg.LoadRouteTableLocal(routePath) + } + if err != nil { + log.Fatalf("fail to load route table: %s", err.Error()) + } + + Route = route + checkConfig(c, Route) } - if err != nil { - log.Fatalf("fail to load route table: %s", err.Error()) - } - - Route = route - checkConfig(c, Route) } + c.Confdir = confdir dbutils.InitSizesPointer(c) c.ConfigPackages() } diff --git a/config/default.go b/config/default.go index 06b1bb5..75fd627 100644 --- a/config/default.go +++ b/config/default.go @@ -31,5 +31,6 @@ var ( ScoreDeviation: 10000, // 10000 Microseconds -> 10 Millisecond ItemSizeStats: 4096, ResponseTimeMin: 4000, + Enable: true, } ) diff --git a/dbtest/conf/proxy.yaml b/dbtest/conf/proxy.yaml new file mode 100644 index 0000000..16b9bca --- /dev/null +++ b/dbtest/conf/proxy.yaml @@ -0,0 +1,34 @@ +# for doubandb proxy +# ~/go/src/github.com/dispensable/gobeansproxy/dbtest/conf +proxy: + listen: 0.0.0.0 + port: 7905 + webport: 7908 + threads: 8 + errorlog: "~/go/src/github.com/dispensable/gobeansproxy/dbtest/log/proxy-error.log" + accesslog: "~/go/src/github.com/dispensable/gobeansproxy/dbtest/log/proxy-access.log" + hostname: 127.0.0.1 + staticdir: ~/go/src/github.com/dispensable/gobeansproxy/dbtest/staticdir/ + zkserves: [] + zkpath: "/gobeansproxy/test" +mc: + max_key_len: 250 + max_req: 16 + 
body_max_str: 50M + body_big_str: 5M + body_c_str: 0K +dstore: + n: 3 + w: 2 + r: 1 + max_free_conns_per_host: 20 + connect_timeout_ms: 300 + write_timeout_ms: 2000 + read_timeout_ms: 2000 + dial_fail_silence_ms: 5000 + response_time_seconds: 10 + error_seconds: 10 + max_connect_errors: 10 + score_deviation: 10000 + item_size_stats: 4096 + response_time_min: 4000 diff --git a/dbtest/conf/route.yaml b/dbtest/conf/route.yaml new file mode 100644 index 0000000..9498624 --- /dev/null +++ b/dbtest/conf/route.yaml @@ -0,0 +1,10 @@ +numbucket: 16 +backup: +- "127.0.0.1:7983" +main: +- addr: 127.0.0.1:7980 + buckets: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f] +- addr: 127.0.0.1:7981 + buckets: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f] +- addr: 127.0.0.1:7982 + buckets: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, e, f] diff --git a/deb-req.d/dev.txt b/deb-req.d/dev.txt new file mode 100644 index 0000000..39f72e4 --- /dev/null +++ b/deb-req.d/dev.txt @@ -0,0 +1,3 @@ +less +python3-pip +python-is-python3 diff --git a/dstore/metrics.go b/dstore/metrics.go new file mode 100644 index 0000000..cc78227 --- /dev/null +++ b/dstore/metrics.go @@ -0,0 +1,88 @@ +package dstore + +import ( + "github.com/prometheus/client_golang/prometheus" +) + +var ( + totalReqs *prometheus.CounterVec + errorReqs *prometheus.CounterVec + rrrStoreReqs *prometheus.CounterVec + rrrStoreErr *prometheus.CounterVec + rrrStoreLag *prometheus.GaugeVec + cmdReqDurationSeconds *prometheus.HistogramVec + cmdE2EDurationSeconds *prometheus.HistogramVec + BdbProxyPromRegistry *prometheus.Registry +) + +func init() { + BdbProxyPromRegistry = prometheus.NewRegistry() + totalReqs = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "gobeansproxy", + Name: "total_reqs", + Help: "total requests counter", + }, + + []string{"cmd", "store"}, + ) + BdbProxyPromRegistry.MustRegister(totalReqs) + + errorReqs = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "gobeansproxy", + Name: 
"error_reqs", + Help: "error requests counter", + }, + + []string{"cmd", "store"}, + ) + BdbProxyPromRegistry.MustRegister(errorReqs) + + cmdE2EDurationSeconds = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: "gobeansproxy", + Name: "cmd_e2e_duration_seconds", + Help: "cmd e2e duration", + Buckets: []float64{ + 0.001, 0.003, 0.005, + 0.01, 0.03, 0.05, 0.07, + 0.1, 0.3, 0.5, 0.7, + 1, 2, 5, + }, + }, + + []string{"cmd"}, + ) + BdbProxyPromRegistry.MustRegister(cmdE2EDurationSeconds) + + rrrStoreReqs = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "gobeansproxy", + Name: "rrr_store_reqs", + Help: "read only rr backends req counter", + }, + []string{"host"}, + ) + BdbProxyPromRegistry.MustRegister(rrrStoreReqs) + + rrrStoreErr = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "gobeansproxy", + Name: "rrr_store_conn_err", + Help: "store connection error counter", + }, + []string{"host", "conn"}, + ) + BdbProxyPromRegistry.MustRegister(rrrStoreErr) + + rrrStoreLag = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: "gobeansproxy", + Name: "rrr_store_lag_ms", + Help: "round robin read only sch store lag", + }, + []string{"host"}, + ) + BdbProxyPromRegistry.MustRegister(rrrStoreLag) +} diff --git a/dstore/read_only_scheduler.go b/dstore/read_only_scheduler.go new file mode 100644 index 0000000..ecda362 --- /dev/null +++ b/dstore/read_only_scheduler.go @@ -0,0 +1,96 @@ +package dstore + +import ( + "fmt" + "math" + "time" + "sync/atomic" + dbcfg "github.com/douban/gobeansdb/config" +) + +type RRReadScheduler struct { + hosts []*Host + current atomic.Int32 + totalHostsI32 int32 + totalHosts int + totalHostsF64 float64 + quit bool +} + + +func NewRRReadScheduler(route *dbcfg.RouteTable) *RRReadScheduler { + rrsche := new(RRReadScheduler) + rrsche.hosts = make([]*Host, len(route.Main)) + for idx, server := range route.Main { + host := NewHost(server.Addr) + rrsche.hosts[idx] = host + } + rrsche.totalHosts = 
len(rrsche.hosts) + rrsche.totalHostsI32 = int32(rrsche.totalHosts) + rrsche.totalHostsF64 = float64(rrsche.totalHosts) + return rrsche +} + +func (sch *RRReadScheduler) GetHostsByKey(key string) (hosts []*Host) { + next := sch.current.Add(1) % sch.totalHostsI32 + sch.current.Store(next) + rrrStoreReqs.WithLabelValues(sch.hosts[next].Addr).Inc() + return sch.hosts[next:next+1] +} + +func (sch *RRReadScheduler) FeedbackError(host *Host, key string, startTime time.Time, errorCode float64) { + rrrStoreErr.WithLabelValues(host.Addr, fmt.Sprintf("%f", errorCode)).Inc() + return +} + + +func (sch *RRReadScheduler) FeedbackLatency(host *Host, key string, startTime time.Time, timeUsed time.Duration) { + rrrStoreLag.WithLabelValues(host.Addr).Set(float64(timeUsed.Milliseconds())) + return +} + +// route some keys to group of hosts +func (sch *RRReadScheduler) DivideKeysByBucket(keys []string) [][]string { + numKeysPer := int(math.Round(float64(len(keys)) / sch.totalHostsF64)) + rs := make([][]string, len(sch.hosts)) + maxEndIdx := len(sch.hosts) - 1 + + startIdx := 0 + partIdx := 0 + for { + endIdx := startIdx + numKeysPer + if endIdx >= len(keys) || partIdx == maxEndIdx { + endIdx = len(keys) + rs[partIdx] = keys[startIdx:endIdx] + break + } + rs[partIdx] = keys[startIdx:endIdx] + partIdx += 1 + startIdx = endIdx + } + return rs +} + +// internal status +func (sch *RRReadScheduler) Stats() map[string]map[string]float64 { + return nil +} + +// get latencies of hosts in the bucket +func (sch *RRReadScheduler) LatenciesStats() map[string]map[string][QUEUECAP]Response { + return nil +} + +// get percentage of hosts in the bucket +func (sch *RRReadScheduler) Partition() map[string]map[string]int { + return nil +} + +// return average latency and arc(percentage) +func (sch *RRReadScheduler) GetBucketInfo(bucketID int64) map[string]map[string]map[string][]Response { + return nil +} + +func (sch *RRReadScheduler) Close() { + sch.quit = true +} diff --git 
a/dstore/read_only_scheduler_test.go b/dstore/read_only_scheduler_test.go new file mode 100644 index 0000000..76292ed --- /dev/null +++ b/dstore/read_only_scheduler_test.go @@ -0,0 +1,43 @@ +package dstore + +import ( + "testing" + + dbcfg "github.com/douban/gobeansdb/config" + "github.com/stretchr/testify/assert" +) + + +func TestDivideKeyByHosts(t *testing.T) { + route := new(dbcfg.RouteTable) + route.Main = append( + route.Main, dbcfg.Server{Addr: "127.0.0.1:7700"}, + dbcfg.Server{Addr: "127.0.0.1:7701"}, dbcfg.Server{Addr: "127.0.0.1:7702"}, + ) + InitGlobalManualScheduler(route, 1, NoBucketsRounRobinROSchduler) + + rrKeyHostCnt := map[string]int{ + "127.0.0.1:7700": 0, + "127.0.0.1:7701": 0, + "127.0.0.1:7702": 0, + } + for i := 1; i < 100; i++ { + testKeys := []string{} + for j := 0; j < i; j++ { + hosts := globalScheduler.GetHostsByKey("j") + assert.True(t, len(hosts) == 1, "rrr scheduler only return one host for one key") + rrKeyHostCnt[hosts[0].Addr] += 1 + testKeys = append(testKeys, "") + } + result := globalScheduler.DivideKeysByBucket(testKeys) + assert.Equal(t, len(route.Main), len(result), "keys should be split part max") + totalK := 0 + for _, k := range result { + totalK += len(k) + } + assert.Equal(t, len(testKeys), totalK, "all key must parted") + assert.True(t, len(testKeys[len(testKeys)-1]) - len(testKeys[0]) < 3, "keys cap diff should less than server nums") + } + assert.True(t, rrKeyHostCnt["127.0.0.1:7700"] - rrKeyHostCnt["127.0.0.1:7701"] < 3, "rr should be balanced") + assert.True(t, rrKeyHostCnt["127.0.0.1:7700"] - rrKeyHostCnt["127.0.0.1:7702"] < 3, "rr should be balanced") +} diff --git a/dstore/scheduler.go b/dstore/scheduler.go index 372e8ba..ab87bab 100644 --- a/dstore/scheduler.go +++ b/dstore/scheduler.go @@ -14,6 +14,8 @@ const ( FeedbackNonConnectErrDelete = -10 FeedbackConnectErrDefault = -2 FeedbackNonConnectErrDefault = -5 + NoBucketsRounRobinROSchduler = "no_buckets_rro" + BucketsManualSchduler = "buckets_manual" ) var ( @@ 
-73,8 +75,21 @@ func GetScheduler() Scheduler { return globalScheduler } -func InitGlobalManualScheduler(route *dbcfg.RouteTable, n int) { - globalScheduler = NewManualScheduler(route, n) +func InitGlobalManualScheduler(route *dbcfg.RouteTable, n int, schedulerName string) { + switch schedulerName { + case BucketsManualSchduler, "": + globalScheduler = NewManualScheduler(route, n) + case NoBucketsRounRobinROSchduler: + if n != 1 { + logger.Fatalf("rro readonly scheduler can only use one replica, now: %d", n) + } + globalScheduler = NewRRReadScheduler(route) + default: + logger.Fatalf( + "Unsupported scheduler, must be: %s or %s", + BucketsManualSchduler, NoBucketsRounRobinROSchduler, + ) + } } func NewManualScheduler(route *dbcfg.RouteTable, n int) *ManualScheduler { @@ -246,7 +261,6 @@ func (sch *ManualScheduler) tryRebalance() { } func (sch *ManualScheduler) checkFailsForBucket(bucket *Bucket) { - hosts := bucket.hostsList for _, hostBucket := range hosts { if item, err := hostBucket.host.Get("@"); err == nil { diff --git a/dstore/store.go b/dstore/store.go index edc2aac..639cdd8 100644 --- a/dstore/store.go +++ b/dstore/store.go @@ -2,31 +2,81 @@ package dstore import ( "errors" + "fmt" "sync" "time" "github.com/douban/gobeansdb/cmem" "github.com/douban/gobeansdb/loghub" mc "github.com/douban/gobeansdb/memcache" + "github.com/prometheus/client_golang/prometheus" + "github.com/douban/gobeansproxy/cassandra" "github.com/douban/gobeansproxy/config" ) var ( logger = loghub.ErrorLogger proxyConf = &config.Proxy -) - -var ( // ErrWriteFailed 表示成功写入的节点数小于 StorageClient.W ErrWriteFailed = errors.New("write failed") + PrefixStorageSwitcher *cassandra.PrefixSwitcher + PrefixTableFinder *cassandra.KeyTableFinder + CqlStore *cassandra.CassandraStore ) type Storage struct { + cstar *cassandra.CassandraStore + PSwitcher *cassandra.PrefixSwitcher + dualWErrHandler *cassandra.DualWriteErrorMgr +} + +func (s *Storage) InitStorageEngine(pCfg *config.ProxyConfig) error { + if 
!pCfg.CassandraStoreCfg.Enable && !pCfg.DStoreConfig.Enable { + return fmt.Errorf("You must enable at least one store engine") + } + + if pCfg.CassandraStoreCfg.Enable { + cstar, err := cassandra.NewCassandraStore(&proxyConf.CassandraStoreCfg) + if err != nil { + return err + } + + s.cstar = cstar + + switcher, err := cassandra.NewPrefixSwitcher(&proxyConf.CassandraStoreCfg, cstar) + if err != nil { + return err + } + s.PSwitcher = switcher + PrefixStorageSwitcher = switcher + PrefixTableFinder = cstar.GetPrefixTableFinder() + CqlStore = cstar + dualWErrCfg := pCfg.CassandraStoreCfg.DualWErrCfg + dualWErrHandler, err := cassandra.NewDualWErrMgr( + &dualWErrCfg, + nil, + ) + if err != nil { + return err + } + s.dualWErrHandler = dualWErrHandler + logger.Infof("dual write log send to: %s", s.dualWErrHandler.EFile) + } else { + switcher, err := cassandra.NewPrefixSwitcher(&proxyConf.CassandraStoreCfg, nil) + if err != nil { + return err + } + s.PSwitcher = switcher + } + return nil } func (s *Storage) Client() mc.StorageClient { - return NewStorageClient(proxyConf.N, proxyConf.W, proxyConf.R) + return NewStorageClient( + proxyConf.N, proxyConf.W, proxyConf.R, + s.cstar, s.PSwitcher, s.dualWErrHandler, + ) } // client for gobeansdb @@ -40,17 +90,46 @@ type StorageClient struct { // reinit by GetScheduler() for each request, i.e. 
entry of each puplic method sched Scheduler + + // cassandra + cstar *cassandra.CassandraStore + + // prefix storage switcher + pswitcher *cassandra.PrefixSwitcher + + // dual write error handler + dualWErrHandler *cassandra.DualWriteErrorMgr + + // proxy hostname cstar cluster name + proxyHostName, cstarClusterName string } -func NewStorageClient(n int, w int, r int) (c *StorageClient) { +func NewStorageClient(n int, w int, r int, + cstar *cassandra.CassandraStore, + pStoreSwitcher *cassandra.PrefixSwitcher, + dualEHandler *cassandra.DualWriteErrorMgr, +) (c *StorageClient) { c = new(StorageClient) c.N = n c.W = w c.R = r + c.cstar = cstar + c.pswitcher = pStoreSwitcher + c.dualWErrHandler = dualEHandler + c.proxyHostName = fmt.Sprintf("%s:%d", proxyConf.Hostname, proxyConf.Port) + if c.cstar != nil { + // for user disabled cstar store + c.cstarClusterName = c.cstar.ClusterName + } return c } func (c *StorageClient) GetSuccessedTargets() []string { + if len(c.SuccessedTargets) == 0 { + c.SuccessedTargets = append(c.SuccessedTargets, "NoWhere", c.proxyHostName) + } else { + c.SuccessedTargets = append(c.SuccessedTargets, c.proxyHostName) + } return c.SuccessedTargets } @@ -60,40 +139,84 @@ func (c *StorageClient) Clean() { } func (c *StorageClient) Get(key string) (item *mc.Item, err error) { - c.sched = GetScheduler() + timer := prometheus.NewTimer( + cmdE2EDurationSeconds.WithLabelValues("get"), + ) + defer timer.ObserveDuration() - hosts := c.sched.GetHostsByKey(key) - cnt := 0 - for _, host := range hosts[:c.N] { - start := time.Now() - item, err = host.Get(key) - if err == nil { - cnt++ - if item != nil { - if item.Cap < proxyConf.ItemSizeStats { - c.sched.FeedbackLatency(host, key, start, time.Now().Sub(start)) + bReadEnable, cReadEnable := c.pswitcher.ReadEnabledOn(key) + + if bReadEnable { + totalReqs.WithLabelValues("get", "beansdb").Inc() + c.sched = GetScheduler() + + hosts := c.sched.GetHostsByKey(key) + cnt := 0 + for _, host := range hosts[:c.N] { + 
start := time.Now() + item, err = host.Get(key) + if err == nil { + cnt++ + if item != nil { + if item.Cap < proxyConf.ItemSizeStats { + c.sched.FeedbackLatency(host, key, start, time.Now().Sub(start)) + } + c.SuccessedTargets = []string{host.Addr} + return + } else { + c.SuccessedTargets = append(c.SuccessedTargets, host.Addr) } - c.SuccessedTargets = []string{host.Addr} - return - } else { - c.SuccessedTargets = append(c.SuccessedTargets, host.Addr) - } - } else { - if isWaitForRetry(err) { - c.sched.FeedbackError(host, key, start, FeedbackConnectErrDefault) } else { - c.sched.FeedbackError(host, key, start, FeedbackNonConnectErrDefault) + if isWaitForRetry(err) { + c.sched.FeedbackError(host, key, start, FeedbackConnectErrDefault) + } else { + c.sched.FeedbackError(host, key, start, FeedbackNonConnectErrDefault) + } } } + + if cnt >= c.R { + // because hosts are sorted + err = nil + } + + // here is a failure exit + return } - if cnt >= c.R { - // because hosts are sorted - err = nil + if cReadEnable { + totalReqs.WithLabelValues("get", "cstar").Inc() + + switch key[0] { + // ref: https://github.com/douban/gobeansdb/wiki/protocol-extention + // ref: https://github.com/douban/gobeansdb/blob/d06c2ff9fcd4f381c54b260ec64186c93d1a024f/gobeansdb/store.go#L157 + case '?': + extended := false + if len(key) > 1 { + if key[1] == '?' 
{ + extended = true + key = key[2:] + } else { + key = key[1:] + } + if !cassandra.IsValidKeyString(key) { + return nil, nil + } + } + item, err = c.cstar.GetMeta(key, extended) + default: + item, err = c.cstar.Get(key) + } + + if err == nil { + c.SuccessedTargets = append(c.SuccessedTargets, c.cstarClusterName) + } else { + errorReqs.WithLabelValues("get", "cstar").Inc() + } + return item, err } - // here is a failure exit - return + return nil, fmt.Errorf("You must enable at least one read engine for get") } func (c *StorageClient) getMulti(keys []string) (rs map[string]*mc.Item, targets []string, err error) { @@ -146,63 +269,138 @@ func (c *StorageClient) getMulti(keys []string) (rs map[string]*mc.Item, targets } func (c *StorageClient) GetMulti(keys []string) (rs map[string]*mc.Item, err error) { - c.sched = GetScheduler() - var lock sync.Mutex - rs = make(map[string]*mc.Item, len(keys)) + timer := prometheus.NewTimer( + cmdE2EDurationSeconds.WithLabelValues("getm"), + ) + defer timer.ObserveDuration() - gs := c.sched.DivideKeysByBucket(keys) - reply := make(chan bool, len(gs)) - for _, ks := range gs { - if len(ks) > 0 { - go func(keys []string) { - r, t, e := c.getMulti(keys) - if e != nil { - err = e - } else { - for k, v := range r { - lock.Lock() - rs[k] = v - c.SuccessedTargets = append(c.SuccessedTargets, t...) - lock.Unlock() + bkeys, ckeys := c.pswitcher.ReadEnableOnKeys(keys) + rs = make(map[string]*mc.Item, len(keys)) + + if len(bkeys) > 0 { + totalReqs.WithLabelValues("getm", "beansdb").Inc() + c.sched = GetScheduler() + var lock sync.Mutex + + gs := c.sched.DivideKeysByBucket(bkeys) + reply := make(chan bool, len(gs)) + for _, ks := range gs { + if len(ks) > 0 { + go func(gkeys []string) { + r, t, e := c.getMulti(gkeys) + if e != nil { + err = e + } else { + for k, v := range r { + lock.Lock() + // k should ALWAYS not exist in rs + // otherwise there would be a memory leak + rs[k] = v + c.SuccessedTargets = append(c.SuccessedTargets, t...) 
+ lock.Unlock() + } } - } + reply <- true + }(ks) + } else { reply <- true - }(ks) - } else { - reply <- true + } + } + + // wait for complete + for range gs { + <-reply + } + + // keys all find in bdb + if len(ckeys) == 0 { + return } } - // wait for complete - for range gs { - <-reply + if len(ckeys) > 0 && err == nil { + totalReqs.WithLabelValues("getm", "cstar").Inc() + err = c.cstar.GetMulti(ckeys, rs) + if err == nil { + c.SuccessedTargets = append(c.SuccessedTargets, c.cstarClusterName) + } } return } func (c *StorageClient) Set(key string, item *mc.Item, noreply bool) (ok bool, err error) { - c.sched = GetScheduler() - hosts := c.sched.GetHostsByKey(key) - ok = false - err = ErrWriteFailed - if len(hosts) >= c.N { - mainSuc, mainTargets := c.setConcurrently(hosts[:c.N], key, item, noreply) - if mainSuc >= c.W { - ok = true - err = nil - c.SuccessedTargets = mainTargets - } else { - backupSuc, backupTargets := c.setConcurrently(hosts[c.N:], key, item, noreply) - if mainSuc+backupSuc >= c.W { + defer item.Free() + timer := prometheus.NewTimer( + cmdE2EDurationSeconds.WithLabelValues("set"), + ) + defer timer.ObserveDuration() + + rwStatus := c.pswitcher.GetStatus(key) + bWriteEnable, cWriteEnable := rwStatus.IsWriteOnBeansdb(), rwStatus.IsWriteOnCstar() + + if bWriteEnable { + totalReqs.WithLabelValues("set", "beansdb").Inc() + + c.sched = GetScheduler() + hosts := c.sched.GetHostsByKey(key) + ok = false + err = ErrWriteFailed + if len(hosts) >= c.N { + mainSuc, mainTargets := c.setConcurrently(hosts[:c.N], key, item, noreply) + if mainSuc >= c.W { ok = true err = nil - c.SuccessedTargets = append(mainTargets, backupTargets...) + c.SuccessedTargets = mainTargets + } else { + backupSuc, backupTargets := c.setConcurrently(hosts[c.N:], key, item, noreply) + if mainSuc+backupSuc >= c.W { + ok = true + err = nil + c.SuccessedTargets = append(mainTargets, backupTargets...) 
+ } } } + cmem.DBRL.SetData.SubSizeAndCount(item.Cap) + if err != nil { + errorReqs.WithLabelValues("set", "beansdb").Inc() + } } - cmem.DBRL.SetData.SubSizeAndCount(item.Cap) - item.Free() - return + + if cWriteEnable { + // beansdb write error cstar just return + if bWriteEnable && err != nil { + return ok, err + } + + totalReqs.WithLabelValues("set", "cstar").Inc() + + // beansdb write succ means this is a legit key + if !bWriteEnable && !cassandra.IsValidKeyString(key) { + return false, fmt.Errorf("Key format invalid") + } + + cok, cerr := c.cstar.Set(key, item) + if cerr != nil { + errorReqs.WithLabelValues("set", "cstar").Inc() + logger.Errorf("set on c* failed: %s, key: %s", cerr, key) + + // we only care c* dual write error only when bdb read enabled + // brwcw -> return bdb result c* error just add to err log + // bwcrw -> return c* error as final error, if bdb write err, c* write will not exec + if bWriteEnable { + errorReqs.WithLabelValues("set", "bcdual").Inc() + c.dualWErrHandler.HandleErr(key, "set", cerr) + + if rwStatus.IsReadOnBeansdb() { + return ok, err + } + } + } + c.SuccessedTargets = append(c.SuccessedTargets, c.cstarClusterName) + return cok, cerr + } + + return ok, err } // cmdReturnType 只在 setConcurrently 函数中使用, @@ -244,6 +442,9 @@ func (c *StorageClient) setConcurrently( } func (c *StorageClient) Append(key string, value []byte) (ok bool, err error) { + if proxyConf.CassandraStoreCfg.Enable { + return false, fmt.Errorf("cstar store do not support append") + } // NOTE: gobeansdb now do not support `append`, this is not tested. 
c.sched = GetScheduler() suc := 0 @@ -274,6 +475,9 @@ func (c *StorageClient) Append(key string, value []byte) (ok bool, err error) { // NOTE: Incr command may has consistency problem // link: http://github.com/douban/gobeansproxy/issues/7 func (c *StorageClient) Incr(key string, value int) (result int, err error) { + if proxyConf.CassandraStoreCfg.Enable { + return 0, fmt.Errorf("cstar store do not support incr") + } c.sched = GetScheduler() suc := 0 for i, host := range c.sched.GetHostsByKey(key) { @@ -308,42 +512,85 @@ func (c *StorageClient) Incr(key string, value int) (result int, err error) { // TODO: 弄清楚为什么 delete 不遵循 NWR 规则 func (c *StorageClient) Delete(key string) (flag bool, err error) { - c.sched = GetScheduler() - suc := 0 - errCnt := 0 - lastErrStr := "" - failedHosts := make([]string, 0, 2) - for i, host := range c.sched.GetHostsByKey(key) { - start := time.Now() - ok, err := host.Delete(key) - if ok { - suc++ - c.SuccessedTargets = append(c.SuccessedTargets, host.Addr) - } else if err != nil { - errCnt++ - lastErrStr = err.Error() - failedHosts = append(failedHosts, host.Addr) - if i >= c.N { - continue + timer := prometheus.NewTimer( + cmdE2EDurationSeconds.WithLabelValues("del"), + ) + defer timer.ObserveDuration() + + rwStatus := c.pswitcher.GetStatus(key) + bWriteEnable, cWriteEnable := rwStatus.IsWriteOnBeansdb(), rwStatus.IsWriteOnCstar() + + if bWriteEnable { + totalReqs.WithLabelValues("del", "beansdb").Inc() + c.sched = GetScheduler() + suc := 0 + errCnt := 0 + lastErrStr := "" + failedHosts := make([]string, 0, 2) + for i, host := range c.sched.GetHostsByKey(key) { + start := time.Now() + ok, err := host.Delete(key) + if ok { + suc++ + c.SuccessedTargets = append(c.SuccessedTargets, host.Addr) + } else if err != nil { + errCnt++ + lastErrStr = err.Error() + failedHosts = append(failedHosts, host.Addr) + if i >= c.N { + continue + } + if !isWaitForRetry(err) { + c.sched.FeedbackError(host, key, start, FeedbackNonConnectErrDelete) + } } - if 
!isWaitForRetry(err) { - c.sched.FeedbackError(host, key, start, FeedbackNonConnectErrDelete) + + // TODO: 弄清楚这里为什么不是 suc > c.W + if suc >= c.N { + break } } - - // TODO: 弄清楚这里为什么不是 suc > c.W - if suc >= c.N { - break + if errCnt > 0 { + logger.Warnf("key: %s was delete failed in %v, and the last error is %s", + key, failedHosts, lastErrStr) + } + if errCnt < 2 { + err = nil + } + flag = suc > 0 + if err != nil { + errorReqs.WithLabelValues("del", "beansdb").Inc() } } - if errCnt > 0 { - logger.Warnf("key: %s was delete failed in %v, and the last error is %s", - key, failedHosts, lastErrStr) - } - if errCnt < 2 { - err = nil + + if cWriteEnable { + // when dual write we follow the beansdb principle + // if bdb write failed we just return and wait for + // client to retry that + if bWriteEnable && err != nil { + return + } + totalReqs.WithLabelValues("del", "cstar").Inc() + if !cassandra.IsValidKeyString(key) { + return false, fmt.Errorf("invalide key format") + } + cflag, cerr := c.cstar.Delete(key) + if cerr != nil { + errorReqs.WithLabelValues("del", "cstar").Inc() + logger.Errorf("del on c* failed: %s, key: %s", cerr, key) + if bWriteEnable { + errorReqs.WithLabelValues("del", "bcdual").Inc() + c.dualWErrHandler.HandleErr(key, "del", cerr) + + if rwStatus.IsReadOnBeansdb() { + return + } + } + } + c.SuccessedTargets = append(c.SuccessedTargets, c.cstarClusterName) + return cflag, cerr } - flag = suc > 0 + return } @@ -352,6 +599,9 @@ func (c *StorageClient) Len() int { } func (c *StorageClient) Close() { + if proxyConf.CassandraStoreCfg.Enable { + c.cstar.Close() + } return } diff --git a/dstore/store_test.go b/dstore/store_test.go index 0d58da3..8052810 100644 --- a/dstore/store_test.go +++ b/dstore/store_test.go @@ -1,17 +1,129 @@ package dstore import ( + "errors" + "flag" "fmt" + "io/ioutil" + "net" + "os" + "os/exec" + "os/user" "path" + "path/filepath" "testing" + "time" + dbcfg "github.com/douban/gobeansdb/gobeansdb" mc 
"github.com/douban/gobeansdb/memcache" + yaml "gopkg.in/yaml.v2" "github.com/douban/gobeansproxy/config" "github.com/douban/gobeansproxy/utils" "github.com/stretchr/testify/assert" ) +var testDataDir = flag.String("testDataDir", "/tmp/gobeansdbproxy/bdb/data/", "this dir will be used by gobeansdb and proxy") + + +func setupSuite(tb testing.TB) func(tb testing.TB) { + user, err := user.Current() + if err != nil { + tb.Fatalf("get username err: %s", err) + } + gopath := os.Getenv("GOPATH") + gobeansdbBin := filepath.Join(gopath, "bin", "gobeansdb") + + if _, err := os.Stat(gobeansdbBin); errors.Is(err, os.ErrNotExist) { + tb.Fatalf("gobeansdb binary not exists, %s", gobeansdbBin) + } + + projDir := utils.GetProjectHomeDir() + + allGobeansdb := []*exec.Cmd{} + for _, p := range []string{"57980", "57981", "57982", "57983"} { + conn, _ := net.DialTimeout("tcp", net.JoinHostPort("localhost", p), time.Second) + if conn != nil { + conn.Close() + tb.Logf("%s port already listening ignore start ...", p) + continue + } + + // we modify config when developer run test without container + gobeansdbCfg := fmt.Sprintf("%s/.doubanpde/scripts/bdb/gobeansproxy/%s/conf/", projDir, p) + cfgParsed := dbcfg.DBConfig{} + yfile, err := ioutil.ReadFile(filepath.Join(gobeansdbCfg, "global.yaml")) + if err != nil { + tb.Fatal(err) + } + err = yaml.Unmarshal(yfile, &cfgParsed) + if err != nil { + tb.Fatalf("load cfg %s err: %s", gobeansdbCfg, err) + } + dataPath := filepath.Join(*testDataDir, p, user.Username, "data") + logPath := filepath.Join(*testDataDir, p, user.Username, "log") + for _, pp := range []string{dataPath, logPath} { + err = os.MkdirAll(pp, os.ModePerm) + if err != nil { + tb.Fatalf("create dir %s err: %s", pp, err) + } + } + cfgParsed.ServerConfig.AccessLog = filepath.Join(logPath, "access.log") + cfgParsed.ServerConfig.ErrorLog = filepath.Join(logPath, "error.log") + cfgParsed.HStoreConfig.DBLocalConfig.Home = dataPath + gobeansdbTestCfg := 
fmt.Sprintf("%s/.doubanpde/scripts/bdb/gobeansproxy/%s/testconf/", projDir, p) + err = os.MkdirAll(gobeansdbTestCfg, os.ModePerm) + if err != nil { + tb.Fatalf("create dir %s err: %s", gobeansdbTestCfg, err) + } + c, err := yaml.Marshal(cfgParsed) + if err != nil { + tb.Fatalf("marshal cfg err: %s", err) + } + + dbGlobalCfg := filepath.Join(gobeansdbTestCfg, "global.yaml") + f, err := os.OpenFile(dbGlobalCfg, os.O_RDWR|os.O_CREATE|os.O_TRUNC, 0755) + if err != nil { + tb.Fatal(err) + } + defer f.Close() + _, err = f.Write(c) + if err != nil { + tb.Fatal(err) + } + routeCfg := filepath.Join(gobeansdbTestCfg, "route.yaml") + rcfg, err := ioutil.ReadFile(filepath.Join(gobeansdbCfg, "route.yaml")) + if err != nil { + tb.Fatal(err) + } + err = ioutil.WriteFile(routeCfg, rcfg, 0644) + if err != nil { + tb.Fatal(err) + } + + cmd := exec.Command( + gobeansdbBin, + "-confdir", + gobeansdbTestCfg, + ) + if err := cmd.Start(); err != nil { + tb.Fatalf("failed to start %s gobeansdb: %s", p, err) + } + tb.Logf("start %s with pid: %d", cmd, cmd.Process.Pid) + allGobeansdb = append(allGobeansdb, cmd) + } + // wait some time let the server started + time.Sleep(time.Second * 5) + + return func(tb testing.TB) { + for _, execCmd := range allGobeansdb { + if err := execCmd.Process.Kill(); err != nil { + tb.Fatalf("failed to kill process %s: %s", execCmd, err) + } + } + } +} + func testClientSet(t *testing.T, c mc.StorageClient, key string, val []byte) { assert := assert.New(t) flag := 2 @@ -28,7 +140,7 @@ func testClientSet(t *testing.T, c mc.StorageClient, key string, val []byte) { assert.Equal(val, v.Body) assert.Equal(flag, v.Flag) - assert.Equal(1, len(getHosts)) + assert.Equal(2, len(getHosts)) assert.True(hasIntersection(setHosts, getHosts)) } @@ -70,9 +182,8 @@ func testStoreClient(t *testing.T, c mc.StorageClient) { r, _ := c.Get(key1) assert.Nil(r) - assert.True(len(c.GetSuccessedTargets()) > 0) + assert.True(len(c.GetSuccessedTargets()) > 2) c.Clean() - 
assert.True(len(c.GetSuccessedTargets()) == 0) // set key2 := "/test/client/2" @@ -108,13 +219,6 @@ func testStoreClient(t *testing.T, c mc.StorageClient) { val4 := make([]byte, 1024*1000) testClientSet(t, c, key4, val4) - // incr - key5 := "/test/client/5" - v5, _ := c.Incr(key5, 3) - assert.Equal(3, v5) - v5, _ = c.Incr(key5, 5) - assert.Equal(8, v5) - // delete key6 := "/test/client/6" val6 := []byte("value 6") @@ -125,14 +229,19 @@ func testStoreClient(t *testing.T, c mc.StorageClient) { assert.Nil(v6) } -func TestStore(t *testing.T) { +func TestDStoreOnly(t *testing.T) { + teardown := setupSuite(t) + defer teardown(t) + homeDir := utils.GetProjectHomeDir() - confdir := path.Join(homeDir, "conf") + confdir := path.Join(homeDir, ".doubanpde", "scripts", "bdb", "gobeansproxy", "dstore-only", "conf") proxyConf := &config.Proxy proxyConf.Load(confdir) - InitGlobalManualScheduler(config.Route, proxyConf.N) - c := NewStorageClient(proxyConf.N, proxyConf.W, proxyConf.R) + InitGlobalManualScheduler(config.Route, proxyConf.N, BucketsManualSchduler) + storage := new(Storage) + storage.InitStorageEngine(proxyConf) + c := NewStorageClient(proxyConf.N, proxyConf.W, proxyConf.R, storage.cstar, storage.PSwitcher, storage.dualWErrHandler) testStoreClient(t, c) } diff --git a/go.mod b/go.mod index a74df06..9a64411 100644 --- a/go.mod +++ b/go.mod @@ -1,11 +1,46 @@ module github.com/douban/gobeansproxy require ( - github.com/davecgh/go-spew v1.1.0 // indirect - github.com/douban/gobeansdb v1.1.2 + github.com/acomagu/trie/v2 v2.0.0 + github.com/douban/gobeansdb v1.1.3 + github.com/gocql/gocql v1.5.2 + github.com/sirupsen/logrus v1.9.3 + github.com/stretchr/testify v1.8.0 + gopkg.in/natefinch/lumberjack.v2 v2.2.1 + gopkg.in/yaml.v2 v2.4.0 + gopkg.in/yaml.v3 v3.0.1 +) + +require ( + github.com/beorn7/perks v1.0.1 // indirect + github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/golang/protobuf v1.5.3 // indirect + 
github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed // indirect + github.com/kr/text v0.2.0 // indirect + github.com/matttproud/golang_protobuf_extensions v1.0.4 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/stretchr/testify v1.2.2 - gopkg.in/yaml.v2 v2.2.1 + github.com/prometheus/client_model v0.3.0 // indirect + github.com/prometheus/common v0.42.0 // indirect + github.com/prometheus/procfs v0.10.1 // indirect + github.com/rogpeppe/go-internal v1.11.0 // indirect + github.com/samuel/go-zookeeper v0.0.0-20190923202752-2cc03de413da // indirect + github.com/spaolacci/murmur3 v1.1.0 // indirect + golang.org/x/exp v0.0.0-20230310171629-522b1b587ee0 // indirect + golang.org/x/sys v0.10.0 // indirect + google.golang.org/protobuf v1.30.0 // indirect + gopkg.in/inf.v0 v0.9.1 // indirect +) + +require ( + github.com/golang/snappy v0.0.4 // indirect + github.com/prometheus/client_golang v1.16.0 + golang.org/x/sync v0.3.0 ) -go 1.13 +go 1.20 + +// for local dev +// replace github.com/douban/gobeansdb => ../gobeansdb + +// replace github.com/douban/gobeansproxy => ../gobeansproxy diff --git a/go.sum b/go.sum index 5446512..fd3d0b5 100644 --- a/go.sum +++ b/go.sum @@ -1,22 +1,89 @@ -github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= +github.com/acomagu/trie/v2 v2.0.0 h1:4/Vt77FUj6qtYl7IN/2QMyl22ztBT0Cr3wg3kL/9mHg= +github.com/acomagu/trie/v2 v2.0.0/go.mod h1:trIf+o9oABbDJULhZ+jUiE5HjfO29H30dQV5PV+P8DA= +github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= +github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= +github.com/bitly/go-hostpool v0.0.0-20171023180738-a3a6125de932 h1:mXoPYz/Ul5HYEDvkta6I8/rnYM5gSdSV2tJ6XbZuEtY= +github.com/bitly/go-hostpool v0.0.0-20171023180738-a3a6125de932/go.mod h1:NOuUCSz6Q9T7+igc/hlvDOUdtWKryOrtFyIVABv/p7k= +github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869 h1:DDGfHa7BWjL4YnC6+E63dPcxHo2sUxDIu8g3QgEJdRY=
+github.com/bmizerany/assert v0.0.0-20160611221934-b7ed37b82869/go.mod h1:Ekp36dRnpXw/yCqJaO+ZrUyxD+3VXMFFr56k5XYrpB4= +github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= +github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/douban/gobeansdb v1.1.2 h1:3PbCOWFheciPWc3dgRJF+uUfsorF6RgMN7y4TmyX3Fk= -github.com/douban/gobeansdb v1.1.2/go.mod h1:lSxWZJgFFUTy4WzcXTVn0g1hvEMwW3InXAUeZngsvMk= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/douban/gobeansdb v1.1.3 h1:ZfAUkOSS9QGk2XKQOWLfmqPHMYrI4CcGOlnvRQ5OAAk= +github.com/douban/gobeansdb v1.1.3/go.mod h1:pVfoirQu5pt26ig5w5yGrLXrVRzbn2/mcJu6uo/NZU4= +github.com/gocql/gocql v1.5.2 h1:WnKf8xRQImcT/KLaEWG2pjEeryDB7K0qQN9mPs1C58Q= +github.com/gocql/gocql v1.5.2/go.mod h1:3gM2c4D3AnkISwBxGnMMsS8Oy4y2lhbPRsH4xnJrHG8= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= +github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/snappy v0.0.3/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= +github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= +github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.9 
h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38= +github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed h1:5upAirOpQc1Q53c0bnx2ufif5kANL7bfZWcc6VJWJd8= +github.com/hailocab/go-hostpool v0.0.0-20160125115350-e80d13ce29ed/go.mod h1:tMWxXQ9wFIaZeTI9F+hmhFiGpFmhOHzyShyFUhRm0H4= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/matryer/is v1.2.0 h1:92UTHpy8CDwaJ08GqLDzhhuixiBUUD1p3AU6PHddz4A= +github.com/matttproud/golang_protobuf_extensions v1.0.4 h1:mmDVorXM7PCGKw94cs5zkfA9PSy5pEvNWRP0ET0TIVo= +github.com/matttproud/golang_protobuf_extensions v1.0.4/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/samuel/go-zookeeper v0.0.0-20180130194729-c4fab1ac1bec h1:6ncX5ko6B9LntYM0YBRXkiSaZMmLYeZ/NWcmeB43mMY= -github.com/samuel/go-zookeeper v0.0.0-20180130194729-c4fab1ac1bec/go.mod h1:gi+0XIa01GRL2eRQVjQkKGqKF3SF9vZR/HnPullcV2E= -github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= -github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w= -github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= -golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= -golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 
-golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= -golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= -golang.org/x/tools v0.0.0-20191011211836-4c025a95b26e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= -golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +github.com/prometheus/client_golang v1.16.0 h1:yk/hx9hDbrGHovbci4BY+pRMfSuuat626eFsHb7tmT8= +github.com/prometheus/client_golang v1.16.0/go.mod h1:Zsulrv/L9oM40tJ7T815tM89lFEugiJ9HzIqaAx4LKc= +github.com/prometheus/client_model v0.3.0 h1:UBgGFHqYdG/TPFD1B1ogZywDqEkwp3fBMvqdiQ7Xew4= +github.com/prometheus/client_model v0.3.0/go.mod h1:LDGWKZIo7rky3hgvBe+caln+Dr3dPggB5dvjtD7w9+w= +github.com/prometheus/common v0.42.0 h1:EKsfXEYo4JpWMHH5cg+KOUWeuJSov1Id8zGR8eeI1YM= +github.com/prometheus/common v0.42.0/go.mod h1:xBwqVerjNdUDjgODMpudtOMwlOwf2SaTr1yjz4b7Zbc= +github.com/prometheus/procfs v0.10.1 h1:kYK1Va/YMlutzCGazswoHKo//tZVlFpKYh+PymziUAg= +github.com/prometheus/procfs v0.10.1/go.mod h1:nwNm2aOCAYw8uTR/9bWRREkZFxAUcWzPHWJq+XBB/FM= +github.com/rogpeppe/go-internal v1.11.0 h1:cWPaGQEPrBb5/AsnsZesgZZ9yb1OQ+GOISoDNXVBh4M= +github.com/rogpeppe/go-internal v1.11.0/go.mod h1:ddIwULY96R17DhadqLgMfk9H9tvdUzkipdSkR5nkCZA= +github.com/samuel/go-zookeeper v0.0.0-20190923202752-2cc03de413da h1:p3Vo3i64TCLY7gIfzeQaUJ+kppEO5WQG3cL8iE8tGHU= +github.com/samuel/go-zookeeper v0.0.0-20190923202752-2cc03de413da/go.mod h1:gi+0XIa01GRL2eRQVjQkKGqKF3SF9vZR/HnPullcV2E= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/spaolacci/murmur3 v1.1.0 
h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= +github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +golang.org/x/exp v0.0.0-20230310171629-522b1b587ee0 h1:LGJsf5LRplCck6jUCH3dBL2dmycNruWNF5xugkSlfXw= +golang.org/x/exp v0.0.0-20230310171629-522b1b587ee0/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= +golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0 h1:ftCYgMx6zT/asHUrPw8BLLscYtGznsLAnjq5RH9P66E= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.10.0 h1:SqMFp9UcQJZa+pmYuAKjd9xq1f0j5rLcDIk0mj4qAsA= +golang.org/x/sys v0.10.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= +google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= +google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng= +google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= gopkg.in/check.v1 
v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v2 v2.2.1 h1:mUhvW9EsL+naU5Q3cakzfE91YhliOondGd6ZrsDBHQE= -gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= +gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/natefinch/lumberjack.v2 v2.2.1 h1:bBRl1b0OH9s/DuPhuXpNl+VtCaJXFZ5/uEFST95x9zc= +gopkg.in/natefinch/lumberjack.v2 v2.2.1/go.mod h1:YD8tP3GAjkrDg1eZH7EGmyESg/lsYskCTPBJVb9jqSc= +gopkg.in/yaml.v2 v2.2.7/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/gobeansproxy/gobeansproxy.go b/gobeansproxy/gobeansproxy.go index 389efeb..b4eaacd 100644 --- a/gobeansproxy/gobeansproxy.go +++ b/gobeansproxy/gobeansproxy.go @@ -54,8 +54,14 @@ func Main() { config.Version, proxyConf.Port, proxyConf) logger.Infof("route table: %#v", config.Route) - dstore.InitGlobalManualScheduler(config.Route, proxyConf.N) + if proxyConf.DStoreConfig.Enable { + dstore.InitGlobalManualScheduler(config.Route, proxyConf.N, proxyConf.Scheduler) + } storage := new(dstore.Storage) + err := storage.InitStorageEngine(proxyConf) + if err != nil { + log.Fatalf("Init storage engine err: %s", err) + } addr := fmt.Sprintf("%s:%d", proxyConf.Listen, proxyConf.Port) server = mc.NewServer(storage) server.Listen(addr) diff --git a/gobeansproxy/web.go b/gobeansproxy/web.go index 363f561..212ddfa 
100644 --- a/gobeansproxy/web.go +++ b/gobeansproxy/web.go @@ -3,8 +3,10 @@ package gobeansproxy import ( "encoding/json" "fmt" + "io/ioutil" "net/http" _ "net/http/pprof" + "path" "path/filepath" "runtime" "strconv" @@ -16,9 +18,12 @@ import ( dbcfg "github.com/douban/gobeansdb/config" mc "github.com/douban/gobeansdb/memcache" "github.com/douban/gobeansdb/utils" + "github.com/douban/gobeansproxy/cassandra" "github.com/douban/gobeansproxy/config" "github.com/douban/gobeansproxy/dstore" + "github.com/prometheus/client_golang/prometheus/promhttp" + yaml "gopkg.in/yaml.v2" ) @@ -55,6 +60,7 @@ func handleJson(w http.ResponseWriter, v interface{}) { } } + type templateHandler struct { once sync.Once filename string @@ -119,6 +125,12 @@ func startWeb() { http.HandleFunc("/route/", handleRoute) http.HandleFunc("/route/version", handleRouteVersion) http.HandleFunc("/route/reload", handleRouteReload) + http.Handle( + "/metrics", + promhttp.HandlerFor(dstore.BdbProxyPromRegistry, + promhttp.HandlerOpts{Registry: dstore.BdbProxyPromRegistry}), + ) + http.HandleFunc("/cstar-cfg", handleCstarCfgReload) webaddr := fmt.Sprintf("%s:%d", proxyConf.Listen, proxyConf.WebPort) go func() { @@ -209,6 +221,11 @@ func getFormValueInt(r *http.Request, name string, ndefault int) (n int, err err } func handleRouteReload(w http.ResponseWriter, r *http.Request) { + if !proxyConf.DStoreConfig.Enable { + w.Write([]byte("err: dstore not enabled")) + return + } + var err error if !dbcfg.AllowReload { w.Write([]byte("err: reloading")) @@ -225,42 +242,206 @@ func handleRouteReload(w http.ResponseWriter, r *http.Request) { } }() - if len(proxyConf.ZKServers) == 0 { - w.Write([]byte("err: not using zookeeper")) - return + if proxyConf.DStoreConfig.Scheduler == dstore.BucketsManualSchduler { + if len(proxyConf.ZKServers) == 0 { + w.Write([]byte("err: not using zookeeper")) + return + } + + defer handleWebPanic(w) + + r.ParseForm() + ver, err := getFormValueInt(r, "ver", -1) + if err != nil { + return + 
} + + newRouteContent, ver, err := dbcfg.ZKClient.GetRouteRaw(ver) + if ver == dbcfg.ZKClient.Version { + w.Write([]byte(fmt.Sprintf("warn: same version %d", ver))) + return + } + info := fmt.Sprintf("update with route version %d\n", ver) + logger.Infof(info) + newRoute := new(dbcfg.RouteTable) + err = newRoute.LoadFromYaml(newRouteContent) + if err != nil { + return + } + + oldScheduler := dstore.GetScheduler() + dstore.InitGlobalManualScheduler(newRoute, proxyConf.N, proxyConf.Scheduler) + config.Route = newRoute + dbcfg.ZKClient.Version = ver + w.Write([]byte("ok")) + + go func() { + // sleep for request to be completed. + time.Sleep(time.Duration(proxyConf.ReadTimeoutMs) * time.Millisecond * 5) + logger.Infof("scheduler closing when reroute, request: %v", r) + oldScheduler.Close() + }() + } else { + routePath := path.Join(proxyConf.Confdir, "route.yaml") + route, err := dbcfg.LoadRouteTableLocal(routePath) + if err != nil { + w.Write([]byte(fmt.Sprintf("%s", err))) + return + } + dstore.InitGlobalManualScheduler(route, proxyConf.N, proxyConf.Scheduler) + config.Route = route + w.Write([]byte("ok")) } +} + +type ReloadableCfg struct { + Cfg map[string]string `json:"cfg"` + Message string `json:"message"` + Error string `json:"error"` +} +func handleCstarCfgReload(w http.ResponseWriter, r *http.Request) { defer handleWebPanic(w) - r.ParseForm() - ver, err := getFormValueInt(r, "ver", -1) - if err != nil { + w.Header().Set("Content-Type", "application/json") + resp := make(map[string]string) + cfgName := r.URL.Query().Get("config") + var dispatcher cassandra.PrefixDisPatcher + + switch cfgName { + case "tablefinder": + if dstore.PrefixTableFinder == nil { + resp["error"] = "cassandra is disabled" + w.WriteHeader(http.StatusBadRequest) + handleJson(w, resp) + return + } + dispatcher = dstore.PrefixTableFinder + case "rwswitcher": + if dstore.PrefixStorageSwitcher == nil { + resp["error"] = "cassandra is disabled" + w.WriteHeader(http.StatusBadRequest) + 
handleJson(w, resp) + return + } + dispatcher = dstore.PrefixStorageSwitcher + default: + resp["error"] = "unsupported config query arg, must be: tablefinder/rwswitcher" + w.WriteHeader(http.StatusBadRequest) + handleJson(w, resp) return } - newRouteContent, ver, err := dbcfg.ZKClient.GetRouteRaw(ver) - if ver == dbcfg.ZKClient.Version { - w.Write([]byte(fmt.Sprintf("warn: same version %d", ver))) - return - } - info := fmt.Sprintf("update with route version %d\n", ver) - logger.Infof(info) - newRoute := new(dbcfg.RouteTable) - err = newRoute.LoadFromYaml(newRouteContent) - if err != nil { + switch r.Method { + case "GET": + response := ReloadableCfg{ + Cfg: dispatcher.GetCurrentMap(), + } + response.Message = "success" + w.WriteHeader(http.StatusOK) + handleJson(w, response) return - } + case "POST": + staticCfg, err := dispatcher.LoadStaticCfg(config.Proxy.Confdir) + if err != nil { + resp["error"] = fmt.Sprintf("load static cfg err: %s", err) + break + } - oldScheduler := dstore.GetScheduler() - dstore.InitGlobalManualScheduler(newRoute, proxyConf.N) - config.Route = newRoute - dbcfg.ZKClient.Version = ver - w.Write([]byte("ok")) + err = dispatcher.LoadCfg(staticCfg, dstore.CqlStore) + if err != nil { + resp["error"] = fmt.Sprintf("load cfg from db err: %s", err) + break + } + resp["message"] = "ok" + case "PUT": + // load cfg static + staticCfg, err := dispatcher.LoadStaticCfg(config.Proxy.Confdir) + if err != nil { + resp["error"] = fmt.Sprintf("load static cfg err: %s", err) + break + } - go func() { - // sleep for request to be completed. 
- time.Sleep(time.Duration(proxyConf.ReadTimeoutMs) * time.Millisecond * 5) - logger.Infof("scheduler closing when reroute, request: %v", r) - oldScheduler.Close() - }() + // upsert new data to db + b, err := ioutil.ReadAll(r.Body) + if err != nil { + resp["error"] = fmt.Sprintf("get body from req err: %s", err) + break + } + defer r.Body.Close() + var data map[string](map[string][]string) + err = json.Unmarshal(b, &data) + if err != nil { + resp["error"] = fmt.Sprintf("parse req err: %s", err) + break + } + pdata, ok := data["prefix"] + if !ok { + resp["error"] = fmt.Sprintf("parse req err: doesn't match {'prefix': {'': ['prefix1', 'prefix2']}}") + break + } + err = dispatcher.Upsert(staticCfg, pdata, dstore.CqlStore) + if err != nil { + resp["error"] = fmt.Sprintf("upsert data %v err: %s", data, err) + break + } + + // require load cfg actually + err = dispatcher.LoadCfg(staticCfg, dstore.CqlStore) + if err != nil { + resp["error"] = fmt.Sprintf("load cfg to server err: %s", err) + break + } + case "DELETE": + // load cfg static + staticCfg, err := dispatcher.LoadStaticCfg(config.Proxy.Confdir) + if err != nil { + resp["error"] = fmt.Sprintf("load static cfg err: %s", err) + break + } + + // upsert new data to db + b, err := ioutil.ReadAll(r.Body) + if err != nil { + resp["error"] = fmt.Sprintf("get body from req err: %s", err) + break + } + defer r.Body.Close() + var data map[string]string + err = json.Unmarshal(b, &data) + if err != nil { + resp["error"] = fmt.Sprintf("parse req err: %s", err) + break + } + + prefix, ok := data["prefix"] + if !ok { + resp["error"] = fmt.Sprintf("req data should like: {'prefix': }") + break + } + err = dispatcher.DeletePrefix(staticCfg, prefix, dstore.CqlStore) + if err != nil { + resp["error"] = fmt.Sprintf("upsert data %v err: %s", data, err) + break + } + + // require load cfg actually + err = dispatcher.LoadCfg(staticCfg, dstore.CqlStore) + if err != nil { + resp["error"] = fmt.Sprintf("load cfg to server err: %s", err) + 
break + } + default: + w.WriteHeader(http.StatusBadRequest) + resp["error"] = "unsupported method" + } + + + if _, ok := resp["error"]; ok { + w.WriteHeader(http.StatusBadGateway) + } else { + w.WriteHeader(http.StatusOK) + resp["message"] = "success" + } + handleJson(w, resp) } diff --git a/misc/gobeansdb_server.sh b/misc/gobeansdb_server.sh deleted file mode 100755 index 46b91a5..0000000 --- a/misc/gobeansdb_server.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash -# based on http://serverfault.com/a/360230/105615 -# - -# 注意这里的端口号需要和 gobeansproxy/conf/route.yaml 的端口号一致 -ports="7980 7981 7982 7983" -cmd="`which gobeansdb`" - -basedir="/tmp/gobeansdb_$USER" -prog="gobeansdb" - -function gen_conf() -{ - ./tests/gen_config.py -d $basedir -} - -function start() -{ - port="$1" - if [ `ps -ef | grep "$cmd" | grep -c $port` -ge 1 ]; then - echo "Beansdb server already started on port '$port'" - else - gen_conf - $cmd -confdir $basedir/$port/conf &> /dev/null & - echo "Starting the beansdb server on port '$port'... " - fi -} - -function stop() -{ - port="$1" - if [ `ps -ef | grep "$cmd" | grep -c $port` -eq 0 ]; then - echo $"Stopped the beansdb server on port '$port'... " - else - kill -TERM `ps -ef | grep "$cmd" | grep $port | grep -v grep | awk '{ print $2 }'` - echo "Stopping the beansdb server on port '$port'... 
" - fi -} - -case "$1" in - start) - if [ -n "$2" ]; then - rm -rf $basedir/$port - start $2 - else - rm -rf $basedir - for port in $ports; do - start $port - done - fi - ;; - stop) - if [ -n "$2" ]; then - port="$2" - stop $port - rm -rf $basedir/$port - else - for port in $ports; do - stop $port - done - rm -rf $basedir - fi - ;; - restart) - if [ -n "$2" ]; then - stop $2 - sleep 1 - start $2 - else - for port in $ports; do - stop $port - sleep 1 - start $port - done - fi - ;; - *) - printf 'Usage: %s {start|stop|restart} [port]\n' "$prog" - exit 1 - ;; -esac diff --git a/pip-req.d/dev.txt b/pip-req.d/dev.txt new file mode 100644 index 0000000..451a564 --- /dev/null +++ b/pip-req.d/dev.txt @@ -0,0 +1,5 @@ +cqlsh + +pytest +requests +pyyaml diff --git a/tests/abnormal_cmd_test.py b/tests/abnormal_cmd_test.py index 8f56619..5e1a6c8 100644 --- a/tests/abnormal_cmd_test.py +++ b/tests/abnormal_cmd_test.py @@ -11,12 +11,12 @@ def setUp(self): def run_cmd_by_telnet(self, cmd, expected, timeout=2): addr, port = self.proxy.addr.split(':') t = telnetlib.Telnet(addr, port) - t.write('%s\r\n' % cmd) - out = t.read_until('\n', timeout=timeout) - t.write('quit\n') + t.write(f'{cmd}\r\n'.encode()) + out = t.read_until(b'\n', timeout=timeout) + t.write(b'quit\n') t.close() - r = out.strip('\r\n') - self.assertEqual(r, expected) + r = out.strip(b'\r\n') + self.assertEqual(r, expected.encode()) def test_get(self): # get not exist key @@ -31,19 +31,35 @@ def test_get(self): def test_set(self): # invalid key cmd = 'set %s 0 0 3\r\naaa' % self.invalid_key - self.run_cmd_by_telnet(cmd, 'SERVER_ERROR write failed') + if self.bdb_write_enable: + self.run_cmd_by_telnet(cmd, 'SERVER_ERROR write failed') + elif self.cstar_write_enable: + self.run_cmd_by_telnet(cmd, 'STORED') cmd = 'set /test/set 0 0 3\r\naaaa' self.run_cmd_by_telnet(cmd, 'CLIENT_ERROR bad data chunk') + self.checkCounterZero() def test_delete(self): key = '/delete/not/exist/key' cmd = 'delete %s' % key - 
self.run_cmd_by_telnet(cmd, 'NOT_FOUND') + if self.bdb_write_enable and not self.cstar_write_enable: + self.run_cmd_by_telnet(cmd, 'NOT FOUND') + + if self.cstar_write_enable: + # cstar will delete a key event if not exists + # it just write a tombestone to mem + self.run_cmd_by_telnet(cmd, 'DELETED') cmd = 'delete %s' % self.invalid_key - self.run_cmd_by_telnet(cmd, 'NOT_FOUND') + if self.bdb_write_enable and not self.cstar_write_enable: + self.run_cmd_by_telnet(cmd, 'NOT FOUND') + + if self.cstar_write_enable: + # cstar will delete a key event if not exists + # it just write a tombestone to mem + self.run_cmd_by_telnet(cmd, 'DELETED') self.checkCounterZero() def test_get_meta_by_key(self): diff --git a/tests/base.py b/tests/base.py index 62f2d3e..9427bd5 100644 --- a/tests/base.py +++ b/tests/base.py @@ -15,29 +15,44 @@ GOBEANSDB_CMD = "gobeansdb" -GOBEANSPROXY_CMD = "../../../../bin/gobeansproxy" +GOBEANSPROXY_CMD = f"{os.environ['GOPATH']}/bin/gobeansproxy" +GOBEANSDB_READ_ENABLE = os.environ.get("GOBEANSPROXY_TEST_BR") == "1" +GOBEANSDB_WRITE_ENABLE = os.environ.get("GOBEANSPROXY_TEST_BW") == "1" +CSTAR_READ_ENABLE = os.environ.get("GOBEANSPROXY_TEST_CR") == "1" +CSTAR_WRITE_ENABLE = os.environ.get("GOBEANSPROXY_TEST_CW") == "1" class BaseTest(unittest.TestCase): def setUp(self): root_dir = '/tmp/gobeansproxy_%s/' % uuid.uuid4() - gen_config.gen_conf(root_dir) + self.bdb_read_enable = GOBEANSDB_READ_ENABLE + self.bdb_write_enable = GOBEANSDB_WRITE_ENABLE + self.cstar_read_enable = CSTAR_READ_ENABLE + self.cstar_write_enable = CSTAR_WRITE_ENABLE + gen_config.gen_conf( + root_dir, + bdb_read_enable=self.bdb_read_enable, + bdb_write_enable=self.bdb_write_enable, + cstar_read_enable=self.cstar_read_enable, + cstar_write_enable=self.cstar_write_enable + ) self.dbs = [GobeansdbInstance(os.path.join(root_dir, str(port), 'conf')) for (port, _) in gen_config.MAIN_PORT_PAIRS] for db in self.dbs: - db.start() + db.start(self.bdb_read_enable) self.backup_dbs = 
[GobeansdbInstance(os.path.join(root_dir, str(port), 'conf')) for (port, _) in gen_config.BACKUP_PORT_PAIRS] for db in self.backup_dbs: - db.start() + db.start(self.bdb_read_enable) self.proxy = GobeansproxyInstance( os.path.join(root_dir, 'proxy', 'conf')) - self.proxy.start() + self.proxy.start(self.bdb_read_enable) def tearDown(self): + # time.sleep(1000) self.proxy.clean() for db in self.dbs: db.clean() @@ -45,14 +60,31 @@ def tearDown(self): db.clean() def checkCounterZero(self): + if not (self.bdb_read_enable or self.bdb_write_enable): + return time.sleep(0.5) content = gethttp(self.proxy.webaddr, 'buffer') buffers = json.loads(content) self.assertEqual(len(buffers), 4) - for _, v in buffers.items(): + for _, v in list(buffers.items()): self.assertEqual(v['Count'], 0, content) self.assertEqual(v['Size'], 0, content) + @classmethod + def require_rw_enable(func, br, bw, cr, cw): + def wrap(func): + def check_rw_func(*args, **kwargs): + if not (GOBEANSDB_READ_ENABLE in br and \ + GOBEANSDB_WRITE_ENABLE in bw and \ + CSTAR_READ_ENABLE in cr and \ + CSTAR_WRITE_ENABLE in cw): + return + return func(*args, **kwargs) + return check_rw_func + + return wrap + + class BaseServerInstance(object): def __init__(self, conf_dir, bin, server_name): @@ -60,14 +92,17 @@ def __init__(self, conf_dir, bin, server_name): self.cmd = '%s -confdir %s' % (bin, conf_dir) self.addr, self.webaddr = get_server_addr(conf_dir, server_name) - def start(self): + def start(self, bdb_read_enable=True): assert self.popen is None self.popen = start_cmd(self.cmd) try_times = 0 while True: try: store = MCStore(self.addr) - store.get("@") + if bdb_read_enable: + store.get("@") + else: + store.set("test", "test") return except IOError: try_times += 1 @@ -76,7 +111,7 @@ def start(self): time.sleep(0.5) def stop(self): - print 'stop', self.cmd + print('stop', self.cmd) if self.popen: stop_cmd(self.popen) self.popen = None diff --git a/tests/basic_test.py b/tests/basic_test.py index e9fe865..8971f4b 
100644 --- a/tests/basic_test.py +++ b/tests/basic_test.py @@ -1,11 +1,13 @@ # coding: utf-8 +import os + from tests.base import BaseTest from tests.dbclient import MCStore from tests.utils import random_string -VERSION, HASH, FLAG, SIZE, TIMESTAMP, CHUNKID, OFFSET = range(7) +VERSION, HASH, FLAG, SIZE, TIMESTAMP, CHUNKID, OFFSET = list(range(7)) class KeyVersionTest(BaseTest): def setUp(self): @@ -25,6 +27,7 @@ def get_meta(self, store, key): assert(len(meta) == 7) return tuple([int(meta[i]) for i in [VERSION, CHUNKID, OFFSET]]) + @BaseTest.require_rw_enable(br=(True,), bw=(True,), cr=(False,), cw=(True, False)) def test_incr(self): store = MCStore(self.proxy.addr) key = 'key1' @@ -32,6 +35,7 @@ def test_incr(self): self.assertEqual(store.get(key), 10) self.checkCounterZero() + @BaseTest.require_rw_enable(br=(True,), bw=(True,), cr=(False,), cw=(False,)) def test_set_version(self): store = MCStore(self.proxy.addr) key = 'key1' @@ -41,19 +45,20 @@ def test_set_version(self): self.assertEqual(store.get(key), 'aaa') self.assertEqual(self.get_meta(store, key), (1, 0, self.last_pos)) - store.set_raw(key, 'bbb', rev=3) + store.set_raw(key, b'bbb', rev=3) self.update_pos(256) self.assertEqual(self.get_meta(store, key), (3, 0, self.last_pos)) - store.set_raw(key, 'bbb', rev=4) + store.set_raw(key, b'bbb', rev=4) self.assertEqual(self.get_meta(store, key), (4, 0, self.last_pos)) - store.set_raw(key, 'ccc', rev=2) - self.assertEqual(store.get(key), 'bbb') + store.set_raw(key, b'ccc', rev=2) + self.assertEqual(store.get(key), b'bbb') self.assertEqual(self.get_meta(store, key), (4, 0, self.last_pos)) self.checkCounterZero() + @BaseTest.require_rw_enable(br=(True,), bw=(True,), cr=(False,), cw=(False,)) def test_delete_version(self): store = MCStore(self.proxy.addr) key = 'key1' @@ -77,7 +82,7 @@ def test_delete_version(self): def test_special_key(self): store = MCStore(self.proxy.addr) - kvs = [('a' * 200, 1), ('a', range(1000))] + kvs = [('a' * 200, 1), ('a', 
list(range(1000)))] for k, v in kvs: self.assertTrue(store.set(k, v)) self.assertEqual(store.get(k), v) @@ -89,17 +94,47 @@ def test_special_key(self): for (k, v) in kvs: v2 = store.get(k) self.assertEqual(v2, v, "key %s, value %s, not %s" % (k, v, v2)) - self.checkCounterZero() + if not self.cstar_write_enable: + self.checkCounterZero() + + def test_big_v(self): + store = MCStore(self.proxy.addr) + key = 'largekeykk' + size = 10 * 1024 * 1024 + string_large = random_string(size // 10) * 10 + self.assertTrue(store.set(key, string_large)) + self.assertEqual(store.get(key), string_large) + + def test_env(self): + self.assertEqual( + os.environ.get("GOBEANSPROXY_TEST_BR") == "1", + self.bdb_read_enable + ) + self.assertEqual( + os.environ.get("GOBEANSPROXY_TEST_BW") == "1", + self.bdb_write_enable + ) + self.assertEqual( + os.environ.get("GOBEANSPROXY_TEST_CR") == "1", + self.cstar_read_enable + ) + self.assertEqual( + os.environ.get("GOBEANSPROXY_TEST_CW") == "1", + self.cstar_write_enable + ) + + @BaseTest.require_rw_enable(br=(True,), bw=(True,), cr=(False,), cw=(False,)) def test_big_value(self): store = MCStore(self.proxy.addr) key = 'largekey' size = 10 * 1024 * 1024 - rsize = (((size + len(key) + 24) >> 8) + 1) << 8 - string_large = random_string(size / 10) * 10 + string_large = random_string(size // 10) * 10 self.assertTrue(store.set(key, string_large)) self.assertEqual(store.get(key), string_large) + + rsize = (((size + len(key) + 24) >> 8) + 1) << 8 self.update_pos(rsize) self.assertEqual(self.get_meta(store, key), (1, 0, self.last_pos)) diff --git a/tests/dbclient.py b/tests/dbclient.py index 96897a9..6b7e217 100644 --- a/tests/dbclient.py +++ b/tests/dbclient.py @@ -11,7 +11,7 @@ def connect(server, **kwargs): comp_threshold=comp_threshold, prefix=prefix) c.config(libmc.MC_CONNECT_TIMEOUT, 300) # 0.3s - c.config(libmc.MC_POLL_TIMEOUT, 3000) # 3s + c.config(libmc.MC_POLL_TIMEOUT, 10000) # 3s c.config(libmc.MC_RETRY_TIMEOUT, 5) # 5s return c @@ -83,4 +83,4 
@@ def exists(self, key): return False def incr(self, key, value): - return self.mc.incr(key, int(value)) \ No newline at end of file + return self.mc.incr(key, int(value)) diff --git a/tests/gen_config.py b/tests/gen_config.py index 1825472..9b24454 100755 --- a/tests/gen_config.py +++ b/tests/gen_config.py @@ -84,7 +84,9 @@ 'max_connect_errors': 3, 'score_deviation': 10, 'item_size_stats': 4096, - 'response_time_min': 4000 + 'response_time_min': 4000, + 'enable_write': True, + 'enable_read': False, }, 'mc': { 'body_big_str': '5M', @@ -104,7 +106,15 @@ 'webport': 7908, 'zkservers': ["zk1:2181"], 'zkpath': "/gobeansproxy/test", - } + }, + 'cassandra': { + 'enable_read': True, + 'enable_write': True, + 'hosts': ["127.0.0.1:9042"], + 'default_key_space': 'doubandb', + 'default_table': 'kvstore', + 'timeout_sec': 5, + }, } ### 注意这里的端口号需要和 gobeansproxy/conf/route.yaml 的端口号一致 @@ -130,7 +140,9 @@ def main(): def gen_conf(root_dir, main_port_pairs=MAIN_PORT_PAIRS, backup_port_pairs=BACKUP_PORT_PAIRS, - proxy_port_pairs=PROXY_PORT_PAIRS): + proxy_port_pairs=PROXY_PORT_PAIRS, + bdb_read_enable=True, bdb_write_enable=True, + cstar_read_enable=False, cstar_write_enable=False): ports = [x[0] for x in main_port_pairs] backup_ports = [x[0] for x in backup_port_pairs] route_conf = gen_route_conf(ports, backup_ports) @@ -141,8 +153,11 @@ def gen_conf(root_dir, proxy_dir = gen_dir(root_dir, 'proxy') proxy_conf_dir = gen_dir(proxy_dir, 'conf') - proxy_conf = gen_proxy_conf(proxy_dir, proxy_port_pairs[0], - proxy_port_pairs[1]) + proxy_conf = gen_proxy_conf( + proxy_dir, proxy_port_pairs[0], proxy_port_pairs[1], + bdb_read_enable, bdb_write_enable, + cstar_read_enable, cstar_write_enable + ) yaml_dump(proxy_conf, join(proxy_conf_dir, 'proxy.yaml')) yaml_dump(route_conf, join(proxy_conf_dir, 'route.yaml')) @@ -195,12 +210,24 @@ def gen_route_conf(ports, backup_ports, numbucket=16): return tmpl -def gen_proxy_conf(logdir, port, webport): +def gen_proxy_conf( + logdir, port, webport, + 
bdb_read_enable=True, bdb_write_enable=True, + cstar_read_enable=False, cstar_write_enable=False): tmpl = copy.deepcopy(proxy_conf_tmpl) tmpl['proxy']['errorlog'] = os.path.join(logdir, 'error.log') tmpl['proxy']['accesslog'] = os.path.join(logdir, 'access.log') tmpl['proxy']['port'] = port tmpl['proxy']['webport'] = webport + + assert (bdb_read_enable or cstar_read_enable) \ + and (bdb_write_enable or cstar_write_enable), \ + 'must enable at least one engine' + + tmpl['cassandra']['enable_read'] = cstar_read_enable + tmpl['cassandra']['enable_write'] = cstar_write_enable + tmpl['dstore']['enable_read'] = bdb_read_enable + tmpl['dstore']['enable_write'] = bdb_write_enable return tmpl diff --git a/tests/pip-req.txt b/tests/pip-req.txt index 5882592..70f174d 100644 --- a/tests/pip-req.txt +++ b/tests/pip-req.txt @@ -1,3 +1,3 @@ -PyYAML==3.11 +PyYAML libmc>=0.5.6 -nose>=1.3.7 +nose3 diff --git a/tests/run_test.sh b/tests/run_test.sh index ede3ce3..087590b 100755 --- a/tests/run_test.sh +++ b/tests/run_test.sh @@ -3,5 +3,34 @@ virtualenv venv source venv/bin/activate venv/bin/python venv/bin/pip install -r tests/pip-req.txt -venv/bin/python venv/bin/nosetests --with-xunit --xunit-file=unittest.xml -deactivate \ No newline at end of file + +# echo ">> test beansdb rw ..." +# export GOBEANSPROXY_TEST_BR=1 GOBEANSPROXY_TEST_BW=1 +# export GOBEANSPROXY_TEST_CR=0 GOBEANSPROXY_TEST_CW=0 +# venv/bin/python venv/bin/nosetests --with-xunit --xunit-file=unittest.xml + +echo ">> test beansdb/cstar dual write, bdb read ..." +export GOBEANSPROXY_TEST_BR=1 GOBEANSPROXY_TEST_BW=1 +export GOBEANSPROXY_TEST_CR=0 GOBEANSPROXY_TEST_CW=1 +venv/bin/python \ + venv/bin/nosetests \ + --with-xunit -v \ + --xunit-file="unittest-br${GOBEANSPROXY_TEST_BR}-bw${GOBEANSPROXY_TEST_BW}-cr${GOBEANSPROXY_TEST_CR}-cw${GOBEANSPROXY_TEST_CW}.xml" + +echo ">> test beansdb/cstar dual write. cstar read ..." 
+export GOBEANSPROXY_TEST_BR=0 GOBEANSPROXY_TEST_BW=1 +export GOBEANSPROXY_TEST_CR=1 GOBEANSPROXY_TEST_CW=1 +venv/bin/python \ + venv/bin/nosetests \ + --with-xunit -v \ + --xunit-file="unittest-br${GOBEANSPROXY_TEST_BR}-bw${GOBEANSPROXY_TEST_BW}-cr${GOBEANSPROXY_TEST_CR}-cw${GOBEANSPROXY_TEST_CW}.xml" + +echo ">> test cstar rw ..." +export GOBEANSPROXY_TEST_BR=0 GOBEANSPROXY_TEST_BW=0 +export GOBEANSPROXY_TEST_CR=1 GOBEANSPROXY_TEST_CW=1 +venv/bin/python \ + venv/bin/nosetests \ + --with-xunit -v \ + --xunit-file="unittest-br${GOBEANSPROXY_TEST_BR}-bw${GOBEANSPROXY_TEST_BW}-cr${GOBEANSPROXY_TEST_CR}-cw${GOBEANSPROXY_TEST_CW}.xml" + +deactivate diff --git a/tests/server_down_test.py b/tests/server_down_test.py index 4d14dc3..5bdef08 100644 --- a/tests/server_down_test.py +++ b/tests/server_down_test.py @@ -8,10 +8,18 @@ class KeyVersionTest(BaseTest): def setUp(self): BaseTest.setUp(self) + # def tearDown(self): + # import time + # time.sleep(600) + # super(self).tearDown() + def _assert_val(self, addr, key, val, msg=None): store = MCStore(addr) self.assertEqual(store.get(key), val, msg) + @BaseTest.require_rw_enable( + br=(True, False), bw=(True,), cr=(False,), cw=(False, True) + ) def test_normal(self): proxy = MCStore(self.proxy.addr) key = '/test/normal/key' @@ -22,6 +30,9 @@ def test_normal(self): for db in self.dbs: self._assert_val(db.addr, key, val) + @BaseTest.require_rw_enable( + br=(True, False), bw=(True,), cr=(False,), cw=(False, True) + ) def test_one_server_down(self): proxy = MCStore(self.proxy.addr) key = '/test/one/server/down' @@ -39,6 +50,9 @@ def test_one_server_down(self): for db in self.backup_dbs: self._assert_val(db.addr, key, None) + @BaseTest.require_rw_enable( + br=(True, False), bw=(True,), cr=(False,), cw=(False, True) + ) def test_two_server_down(self): proxy = MCStore(self.proxy.addr) key = '/test/two/server/down' @@ -57,6 +71,9 @@ def test_two_server_down(self): for db in self.backup_dbs: self._assert_val(db.addr, key, val) + 
@BaseTest.require_rw_enable( + br=(True, False), bw=(True,), cr=(False,), cw=(False, True) + ) def test_three_server_down(self): proxy = MCStore(self.proxy.addr) key = '/test/three/server/down' diff --git a/tests/switch_storage_test.py b/tests/switch_storage_test.py new file mode 100644 index 0000000..013d0c8 --- /dev/null +++ b/tests/switch_storage_test.py @@ -0,0 +1,215 @@ +import os +import pytest +import requests +import yaml + +from yaml import load, dump +try: + from yaml import CLoader as Loader, CDumper as Dumper +except ImportError: + from yaml import Loader, Dumper + + +from .dbclient import MCStore as store + +store_addr = os.getenv("GOBEANSPROXY_ADDR") +store_api = os.getenv("GOBEANSPROXY_WEB") +store_proxy_cfg = os.getenv("GOBEANSPROXY_PROXY_CFG") or \ + '/home/project/.doubanpde/scripts/bdb/gobeansproxy/prefix-switch-cfg/conf/proxy.yaml' + +p_status_brw = 'br1w1cr0w0' +p_status_brw_cw = 'br1w1cr0w1' +p_status_bw_crw = 'br0w1cr1w1' +p_status_crw = 'br0w0cr1w1' + +order_of_status = { + p_status_brw: 0, + p_status_brw_cw: 1, + p_status_bw_crw: 2, + p_status_crw: 3, +} + + +class TestSwitchStorage: + + def setup_method(self): + self.client = store(store_addr or "127.0.0.1:47907") + self.prefix = "/__test_proxy/" + self.prefix_wont_switch = "/__test_proxy_no_switch/" + self.key_max = 100 + self.web_addr = store_api or "http://localhost:47910/cstar-cfg?config=rwswitcher" + self.web_req = requests.Session() + self.store_proxy_cfg_backup = store_proxy_cfg + '.backup' + # copy cfg bak + with open(store_proxy_cfg, 'r+') as f: + with open(self.store_proxy_cfg_backup, 'w') as b: + b.write(f.read()) + self.status = p_status_brw + self.switch_store(p_status_brw) + + def format_key(self, k): + return f"{self.prefix}{k}" + + def teardown_method(self): + self.web_req.close() + with open(self.store_proxy_cfg_backup, 'r+') as f: + with open(store_proxy_cfg, 'w') as o: + o.write(f.read()) + self.trigger_reload() + + @pytest.mark.parametrize("test_kv", [ + (1, 1), + 
("str", "str"), + ("list", ["0", 1, 2]), + ("dict", {"1": 1, "2": 2, "3": 3}), + ("中文", "中文str"), + ("bytes", b'abcde'), + ("nesting", [{"abc中文": ["1", "2", "fff"]}]), + ]) + def test_curd_value(self, test_kv): + k, v = test_kv + key = self.format_key(k) + assert self.client.set(key, v) + assert self.client.get(key) == v + assert self.client.delete(key) + assert self.client.get(key) is None + + @pytest.mark.parametrize("test_kvs", [ + ( + (1, 1), + ("str", "str"), + ("list", ["0", 1, 2]), + ("dict", {"1": 1, "2": 2, "3": 3}), + ("中文", "中文str"), + ("bytes", b'abcde'), + ("nesting", [{"abc中文": ["1", "2", "fff"]}]), + ("bool", True), + ), + ]) + def test_getm_value(self, test_kvs): + getm_prefix = '__test_proxy_getm/' + r = {f'{getm_prefix}{k}': v for k, v in test_kvs} + assert len(r) == len(test_kvs) + keys = list(r.keys()) + + for k, v in r.items(): + assert self.client.set(k, v) + + result = self.client.get_multi(keys) + assert len(keys) == len(result) + for k, v in result.items(): + assert r[k] == v + assert self.client.delete(k) + + def trigger_reload(self): + resp = self.web_req.post(self.web_addr) + assert resp.json().get('message') == "success", 'failed, resp: {}'.format(resp.json()) + + def update_rw_dispatch_cfg(self, switch_to, prefixes): + data = { + "prefix": { + switch_to: prefixes + } + } + resp = self.web_req.put(self.web_addr, json=data) + assert 'error' not in resp.json() + + def clean_rw_dispatch_cfg(self, prefix): + data = { + "prefix": prefix + } + resp = self.web_req.delete(self.web_addr, json=data) + assert 'error' not in resp.json() + + def switch_store(self, switch_to, use_static_cfg=True): + assert switch_to in (p_status_brw, p_status_brw_cw, + p_status_bw_crw, p_status_crw) + if self.status == switch_to: + return + + self.clean_rw_dispatch_cfg(self.prefix) + with open(store_proxy_cfg, 'r+') as f: + data = load(f, Loader=Loader) + if use_static_cfg: + scfg = {switch_to: [self.prefix]} + else: + # we should clean static cfg cause this will + # 
conflict with our db cfg items + scfg = {} + + data['cassandra']['prefix_rw_dispatcher_cfg']['static'] = scfg + + f.seek(0, 0) + f.truncate() + + f.write(dump(data, Dumper=Dumper)) + if use_static_cfg: + self.trigger_reload() + else: + # using put api for cfg update + self.update_rw_dispatch_cfg(switch_to, [self.prefix]) + self.status = switch_to + + def test_switch_store(self): + + switch_to = [ + # bdb -> cassandra + ( + p_status_brw_cw, + p_status_bw_crw, + p_status_crw, + p_status_brw + ), + + # bdb -> cassandra dual write -> bdb -> cassandra + ( + p_status_brw_cw, + p_status_bw_crw, + p_status_brw_cw, + p_status_bw_crw, + p_status_crw, + p_status_brw + ), + ] + + key = self.format_key('switch_test') + value = 'value' + + no_switch_key = f'{self.prefix_wont_switch}notme' + no_switch_value = "static" + + assert self.client.set(key, value) + assert self.client.set(no_switch_key, no_switch_value) + + for use_static_cfg in (True, False): + for stages in switch_to: + last_stage = None + for idx, stage in enumerate(stages): + last_stage = self.status + self.switch_store(stage, use_static_cfg) + + # ensure we can still get values + # when change from crw -> other br status this is not going to equal + if stage in (p_status_brw, p_status_brw_cw) and last_stage == p_status_crw: + assert self.client.get(key) != value, f'stages: {stages} -> stage: {stage} error' + else: + assert self.client.get(key) == value, f'stages: {stages} -> stage: {stage} error' + assert self.client.get(no_switch_key) == no_switch_value, f'stages: {stages} -> stage: {stage} error' + + # ensure we can set to new value + value = f'value_on_{stage}' + assert self.client.set(key, value), f'stages: {stages} -> stage: {stage} error' + assert self.client.get(key) == value, f'stages: {stages} -> stage: {stage} error' + no_switch_value = f'static_on_{stage}' + assert self.client.set(no_switch_key, no_switch_value), f'stages: {stages} -> stage: {stage} error' + assert self.client.get(no_switch_key) == 
no_switch_value, f'stages: {stages} -> stage: {stage} error' + + # ensure we can delete value + assert self.client.delete(key), f'stages: {stages} -> stage: {stage} error' + assert self.client.get(key) is None, f'stages: {stages} -> stage: {stage} error' + assert self.client.set(key, value), f'stages: {stages} -> stage: {stage} error' + assert self.client.get(key) == value, f'stages: {stages} -> stage: {stage} error' + + self.switch_store(p_status_brw) + assert self.client.delete(key), f'stages: {stages} -> stage: {stage} error' + assert self.client.delete(no_switch_key), f'stages: {stages} -> stage: {stage} error' diff --git a/tests/utils.py b/tests/utils.py index 3af139b..cb9902f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,10 +1,14 @@ import os import errno import yaml +try: + from yaml import Cloader as Loader +except ImportError: + from yaml import Loader import string import socket import random -import urllib2 +import urllib.request, urllib.error, urllib.parse import time import shlex import subprocess @@ -24,24 +28,24 @@ def mkdir_p(path): def random_string(n): s = string.ascii_letters result = "" - for _ in xrange(n): + for _ in range(n): result += random.choice(s) return result def gethttp(addr, path): url = "http://%s/%s" % (addr, path) - response = urllib2.urlopen(url) + response = urllib.request.urlopen(url) return response.read() def start_cmd(cmd): - print "start", cmd + print("start", cmd) log_file = '/tmp/beansdb/log.txt' mkdir_p(os.path.dirname(log_file)) with open(log_file, 'a') as f: p = subprocess.Popen( - cmd if isinstance(cmd, (tuple, list,)) else shlex.split(cmd), + cmd if isinstance(cmd, (tuple, list)) else shlex.split(cmd), stderr=f, ) time.sleep(0.2) @@ -73,9 +77,9 @@ def get_server_addr(conf_dir, server_name): def port_to_addr(port): return '%s:%s' % (host, port) - return map(port_to_addr, [port, webport]) + return list(map(port_to_addr, [port, webport])) def load_yaml(filepath): with open(filepath) as f: - return yaml.load(f) 
+ return yaml.load(f, Loader=Loader)