diff --git a/service_capacity_modeling/models/common.py b/service_capacity_modeling/models/common.py
index 551fa47..1fe2014 100644
--- a/service_capacity_modeling/models/common.py
+++ b/service_capacity_modeling/models/common.py
@@ -57,7 +57,23 @@ def _sqrt_staffed_cores(rps: float, latency_s: float, qos: float) -> int:
 
 
 def sqrt_staffed_cores(desires: CapacityDesires) -> int:
-    """Computes cores given a sqrt staffing model"""
+    """Computes cores given a sqrt staffing model
+
+    Little's Law: Concurrency = Average Rate * Average Latency
+    For example: 0.1 average concurrency = 100 / second * 1 millisecond
+
+    However, if you provision for the average, statistically unlikely traffic
+    spikes will cause queueing, which creates _latency_.
+
+    Square root staffing says that to avoid that latency, instead of
+    provisioning the average number of cores, you provision:
+
+    Cores = (Rate * Latency) + (QoS * sqrt(Rate * Latency))
+    Cores = (Required cores) + (Safety margin)
+
+    Pick a higher QoS to reduce the probability of queueing. In our case we
+    pick it based on service tier.
+    """
     qos = _QOS(desires.service_tier)
     read_rps, read_lat = (
         desires.query_pattern.estimated_read_per_second.mid,
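For intuition, here is a quick worked example of the staffing formula described in the new docstring. This is a minimal standalone sketch; the helper name and the sample numbers are illustrative, not part of the model.

import math

def staffed_cores(rps: float, latency_s: float, qos: float) -> int:
    # Required cores per Little's Law, plus a QoS-scaled safety margin
    concurrency = rps * latency_s
    return math.ceil(concurrency + qos * math.sqrt(concurrency))

# 10,000 reads/second at 1 ms each keeps ~10 cores busy on average;
# with qos=3 the square root term adds ~10 more cores of headroom.
print(staffed_cores(10_000, 0.001, 3))  # -> 20
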
= ("spend", "mem") # (Arun): As of 2021 we are using ephemerals exclusively and do not # use cloud drives - if instance.drive is None: + if working_set is None or desires.data_shape.estimated_state_size_gib.mid < 110.0: # We can't currently store data on cloud drives, but we can put the # dataset into memory! needed_memory = float(needed_disk) @@ -81,13 +110,9 @@ def _estimate_evcache_requirement( needed_memory = float(working_set) * float(needed_disk) regrets = ("spend", "disk", "mem") - # Now convert to per zone - needed_cores = max(1, needed_cores // zones_per_region) - if needed_disk > 0: - needed_disk = max(1, needed_disk // zones_per_region) - else: - needed_disk = needed_disk // zones_per_region - needed_memory = max(1, int(needed_memory // zones_per_region)) + # For EVCache, writes go to all zones + # Regional reads can also go to any one zone due to app's zone affinity + needed_cores = max(1, needed_cores) logger.debug( "Need (cpu, mem, disk, working) = (%s, %s, %s, %f)", needed_cores, @@ -146,16 +171,20 @@ def _estimate_evcache_cluster_zonal( # working set to keep more or less data in RAM. Faster drives need # less fronting RAM. ws_drive = instance.drive or drive - working_set = working_set_from_drive_and_slo( - drive_read_latency_dist=dist_for_interval(ws_drive.read_io_latency_ms), - read_slo_latency_dist=dist_for_interval( - desires.query_pattern.read_latency_slo_ms - ), - estimated_working_set=desires.data_shape.estimated_working_set_percent, - # Caches have very tight latency SLOs, so we target a high - # percentile of the drive latency distribution for WS calculation - target_percentile=0.99, - ).mid + + if ws_drive: + working_set = working_set_from_drive_and_slo( + drive_read_latency_dist=dist_for_interval(ws_drive.read_io_latency_ms), + read_slo_latency_dist=dist_for_interval( + desires.query_pattern.read_latency_slo_ms + ), + estimated_working_set=desires.data_shape.estimated_working_set_percent, + # Caches have very tight latency SLOs, so we target a high + # percentile of the drive latency distribution for WS calculation + target_percentile=0.99, + ).mid + else: + working_set = None requirement, regrets = _estimate_evcache_requirement( instance=instance, @@ -182,7 +211,6 @@ def reserve_memory(instance_mem_gib): return base_mem + variable_os requirement.context["osmem"] = reserve_memory(instance.ram_gib) - # EVCache clusters aim to be at least 2 nodes per zone to start # out with for tier 0 min_count = 0 @@ -208,7 +236,6 @@ def reserve_memory(instance_mem_gib): reserve_memory=lambda x: base_mem, core_reference_ghz=requirement.core_reference_ghz, ) - # Communicate to the actual provision that if we want reduced RF params = {"evcache.copies": copies_per_region} _upsert_params(cluster, params) @@ -239,9 +266,11 @@ def reserve_memory(instance_mem_gib): ) ec2_cost = zones_per_region * cluster.annual_cost + spread_cost = calculate_spread_cost(cluster.count) # Account for the clusters and replication costs - evcache_costs = {"evcache.zonal-clusters": ec2_cost} + evcache_costs = {"evcache.zonal-clusters": ec2_cost, "evcache.spread.cost": spread_cost} + for s in services: evcache_costs[f"{s.service_type}"] = s.annual_cost @@ -354,6 +383,19 @@ def default_desires(user_desires, extra_model_arguments: Dict[str, Any]): f"User asked for {key}={value}" ) + estimated_read_size: Interval = Interval( + **user_desires.query_pattern.dict(exclude_unset=True).get( + "estimated_mean_read_size_bytes", + dict(low=16, mid=1024, high=65536, confidence=0.95), + ) + ) + estimated_read_latency_ms: 
@@ -354,6 +383,19 @@ def default_desires(user_desires, extra_model_arguments: Dict[str, Any]):
                 f"User asked for {key}={value}"
             )
 
+    estimated_read_size: Interval = Interval(
+        **user_desires.query_pattern.dict(exclude_unset=True).get(
+            "estimated_mean_read_size_bytes",
+            dict(low=16, mid=1024, high=65536, confidence=0.95),
+        )
+    )
+    estimated_read_latency_ms: Interval = Interval(
+        low=calculate_read_cpu_time_evcache_ms(estimated_read_size.low),
+        mid=calculate_read_cpu_time_evcache_ms(estimated_read_size.mid),
+        high=calculate_read_cpu_time_evcache_ms(estimated_read_size.high),
+        confidence=estimated_read_size.confidence,
+    )
+
     if user_desires.query_pattern.access_pattern == AccessPattern.latency:
         return CapacityDesires(
             query_pattern=QueryPattern(
@@ -367,20 +409,16 @@ def default_desires(user_desires, extra_model_arguments: Dict[str, Any]):
                         target_consistency=AccessConsistency.never
                     ),
                 ),
-                estimated_mean_read_size_bytes=Interval(
-                    low=128, mid=1024, high=65536, confidence=0.95
-                ),
+                estimated_mean_read_size_bytes=estimated_read_size,
                 estimated_mean_write_size_bytes=Interval(
                     low=64, mid=512, high=1024, confidence=0.95
                 ),
-                # memcache point queries usually take just around 100us
-                # of on CPU time for reads and writes. Memcache is very
-                # fast
-                estimated_mean_read_latency_ms=Interval(
-                    low=0.01, mid=0.1, high=0.2, confidence=0.98
-                ),
+                # evcache read latency is sensitive to payload size,
+                # so it is computed above from the read size
+                estimated_mean_read_latency_ms=estimated_read_latency_ms,
+                # evcache point writes take around 10 micros of CPU time
                 estimated_mean_write_latency_ms=Interval(
-                    low=0.01, mid=0.1, high=0.2, confidence=0.98
+                    low=0.01, mid=0.01, high=0.01, confidence=0.98
                 ),
                 # Assume point queries, "1 millisecond SLO"
                 read_latency_slo_ms=FixedInterval(
@@ -406,7 +444,7 @@ def default_desires(user_desires, extra_model_arguments: Dict[str, Any]):
                     low=10, mid=100, high=600, confidence=0.98
                 ),
                 # (Arun): The management sidecar takes 512 MiB
-                reserved_instance_app_mem_gib=0.5,
+                reserved_instance_app_mem_gib=1,
                 # account for the memcached connection memory
                 # and system requirements.
                 # (Arun) We currently use 1 GiB for connection memory
@@ -415,7 +453,6 @@ def default_desires(user_desires, extra_model_arguments: Dict[str, Any]):
         )
     else:
         return CapacityDesires(
-            # (FIXME): Need to pair with memcache folks on the exact values
             query_pattern=QueryPattern(
                 access_pattern=AccessPattern.throughput,
                 access_consistency=GlobalConsistency(
@@ -427,19 +464,16 @@ def default_desires(user_desires, extra_model_arguments: Dict[str, Any]):
                         target_consistency=AccessConsistency.never
                     ),
                 ),
-                estimated_mean_read_size_bytes=Interval(
-                    low=128, mid=1024, high=65536, confidence=0.95
-                ),
+                estimated_mean_read_size_bytes=estimated_read_size,
                 estimated_mean_write_size_bytes=Interval(
                     low=128, mid=1024, high=65536, confidence=0.95
                 ),
-                # evcache bulk reads usually take slightly longer
-                estimated_mean_read_latency_ms=Interval(
-                    low=0.01, mid=0.15, high=0.3, confidence=0.98
-                ),
+                # evcache read latency is sensitive to payload size,
+                # so it is computed above from the read size
+                estimated_mean_read_latency_ms=estimated_read_latency_ms,
                 # evcache bulk puts usually take slightly longer
                 estimated_mean_write_latency_ms=Interval(
-                    low=0.01, mid=0.15, high=0.3, confidence=0.98
+                    low=0.01, mid=0.01, high=0.01, confidence=0.98
                 ),
                 # Assume they're multi-getting -> slow reads
                 read_latency_slo_ms=FixedInterval(
@@ -466,7 +500,7 @@ def default_desires(user_desires, extra_model_arguments: Dict[str, Any]):
                     low=10, mid=100, high=1000, confidence=0.98
                 ),
                 # (Arun): The management sidecar takes 512 MiB
-                reserved_instance_app_mem_gib=0.5,
+                reserved_instance_app_mem_gib=1,
                 # account for the memcached connection memory
                 # and system requirements.
                 # (Arun) We currently use 1 GiB base for connection memory
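As a sanity check on the new size-aware read latency, here is a minimal sketch evaluating the helper at the payload sizes quoted in its comments. The "fit point" values come from those comments, not from new measurements, so the printed curve values only approximate them.

from service_capacity_modeling.models.org.netflix.evcache import (
    calculate_read_cpu_time_evcache_ms,
)

fit_points_us = {32: 10, 200: 41, 1024: 66, 24 * 1024: 133, 40 * 1024: 158}
for size_bytes, fitted_us in fit_points_us.items():
    millis = calculate_read_cpu_time_evcache_ms(size_bytes)
    print(f"{size_bytes}B -> {millis * 1000:.0f}us (fit point ~{fitted_us}us)")
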
diff --git a/tests/netflix/test_evcache.py b/tests/netflix/test_evcache.py
index bf3b688..966328d 100644
--- a/tests/netflix/test_evcache.py
+++ b/tests/netflix/test_evcache.py
@@ -1,294 +1,298 @@
 from service_capacity_modeling.capacity_planner import planner
-from service_capacity_modeling.interface import AccessPattern, certain_float
 from service_capacity_modeling.interface import CapacityDesires
 from service_capacity_modeling.interface import DataShape
 from service_capacity_modeling.interface import Interval
 from service_capacity_modeling.interface import QueryPattern
-
-
-def test_evcache_high_qps():
-    qps = 100_000
-    high_qps = CapacityDesires(
+from service_capacity_modeling.models.org.netflix.evcache import (
+    calculate_read_cpu_time_evcache_ms,
+)
+
+
+def test_evcache_read_latency():
+    # 256 bits = 32 bytes -> ~10 micros
+    small = calculate_read_cpu_time_evcache_ms(32)
+    # 1600 bits = 200 bytes -> ~41 micros
+    medium = calculate_read_cpu_time_evcache_ms(200)
+    # 8192 bits = 1024 bytes -> ~66 micros
+    large = calculate_read_cpu_time_evcache_ms(1024)
+    # 24 KiB -> ~133 micros
+    very_large = calculate_read_cpu_time_evcache_ms(24 * 1024)
+    # 40 KiB -> ~158 micros
+    extra_large = calculate_read_cpu_time_evcache_ms(40 * 1024)
+
+    assert calculate_read_cpu_time_evcache_ms(1) > 0
+    assert 0.008 < small < 0.015
+    assert 0.030 < medium < 0.050
+    assert 0.060 < large < 0.080
+    assert 0.120 < very_large < 0.140
+    assert 0.140 < extra_large < 0.160
+
+
+def test_evcache_inmemory_low_latency_reads_cpu():
+    inmemory_cluster_low_latency_reads_qps = CapacityDesires(
         service_tier=1,
         query_pattern=QueryPattern(
             estimated_read_per_second=Interval(
-                low=qps // 10, mid=qps, high=qps * 10, confidence=0.98
+                low=18300000, mid=34200000, high=34200000 * 1.2, confidence=1.0
             ),
             estimated_write_per_second=Interval(
-                low=qps // 10, mid=qps, high=qps * 10, confidence=0.98
+                low=228000, mid=536000, high=536000 * 1.2, confidence=1.0
             ),
-            estimated_write_size_bytes=Interval(
-                low=10, mid=100, high=1000, confidence=0.98
+            estimated_mean_write_size_bytes=Interval(
+                low=3778, mid=3778, high=3778 * 1.2, confidence=1.0
+            ),
+            estimated_mean_read_size_bytes=Interval(
+                low=35, mid=35, high=35 * 1.2, confidence=1.0
             ),
         ),
         data_shape=DataShape(
-            estimated_state_size_gib=Interval(
-                low=10, mid=100, high=1000, confidence=0.98
-            ),
+            estimated_state_size_gib=Interval(low=36, mid=36, high=36, confidence=1.0),
             estimated_state_item_count=Interval(
-                low=10, mid=100, high=1000, confidence=0.98
+                low=416000000, mid=804000000, high=804000000 * 1.2, confidence=1.0
             ),
         ),
     )
-    plan = planner.plan(
+
+    plan = planner.plan_certain(
         model_name="org.netflix.evcache",
         region="us-east-1",
-        desires=high_qps,
+        desires=inmemory_cluster_low_latency_reads_qps,
     )
-    assert len(plan.least_regret) >= 2
+    for candidate in plan:
+        total_cpu_power = candidate.candidate_clusters.zonal[0].count * \
+            candidate.candidate_clusters.zonal[0].instance.cpu * \
+            candidate.candidate_clusters.zonal[0].instance.cpu_ghz
 
-    lr = plan.least_regret[0]
-    # EVCache should regret having too little RAM, disk and spending too much
-    assert all(k in lr.requirements.regrets for k in ("spend", "mem", "disk"))
+        assert total_cpu_power > 1100
+
+def test_evcache_inmemory_medium_latency_reads_cpu():
+    inmemory_cluster_medium_latency_reads_qps = CapacityDesires(
+        service_tier=0,
+        query_pattern=QueryPattern(
+            estimated_read_per_second=Interval(
+                low=470000, mid=1800000, high=1800000 * 1.2, confidence=1.0
+            ),
+            estimated_write_per_second=Interval(
+                low=505000, mid=861000, high=861000 * 1.2, confidence=1.0
+            ),
+            estimated_mean_write_size_bytes=Interval(
+                low=365, mid=365, high=365 * 1.2, confidence=1.0
+            ),
+            estimated_mean_read_size_bytes=Interval(
+                low=193, mid=193, high=193 * 1.2, confidence=1.0
+            ),
+        ),
+        data_shape=DataShape(
+            estimated_state_size_gib=Interval(low=61, mid=61, high=61, confidence=1.0),
+            estimated_state_item_count=Interval(
+                low=125000000, mid=202000000, high=202000000 * 1.2, confidence=1.0
+            ),
+        ),
+    )
 
-    # EVCache should be pretty cheap for 100k QPS
-    assert lr.candidate_clusters.annual_costs["evcache.zonal-clusters"] < 10000
-    # Without replication shouldn't have network costs
-    assert len(lr.candidate_clusters.annual_costs.keys()) == 1
+    plan = planner.plan_certain(
+        model_name="org.netflix.evcache",
+        region="us-east-1",
+        desires=inmemory_cluster_medium_latency_reads_qps,
+    )
 
-    zc = lr.candidate_clusters.zonal[0]
+    for candidate in plan:
+        total_cpu_power = candidate.candidate_clusters.zonal[0].count * \
+            candidate.candidate_clusters.zonal[0].instance.cpu * \
+            candidate.candidate_clusters.zonal[0].instance.cpu_ghz
 
-    if zc.instance.drive is not None:
-        # If we end up with disk we want at least 100 GiB of disk per zone
-        assert zc.count * zc.instance.drive.size_gib > 100
-    else:
-        # If we end up with RAM we want at least 100 GiB of ram per zone
-        assert zc.count * zc.instance.ram_gib > 100
+        assert total_cpu_power > 400
 
 
-def test_evcache_large_data():
-    qps = 10_000
-    large_data = CapacityDesires(
-        service_tier=1,
+def test_evcache_inmemory_high_latency_reads_cpu():
+    inmemory_cluster_high_latency_reads_qps = CapacityDesires(
+        service_tier=0,
         query_pattern=QueryPattern(
             estimated_read_per_second=Interval(
-                low=qps // 10, mid=qps, high=qps * 10, confidence=0.98
+                low=113000, mid=441000, high=441000 * 1.2, confidence=1.0
             ),
             estimated_write_per_second=Interval(
-                low=qps // 10, mid=qps, high=qps * 10, confidence=0.98
+                low=19000, mid=35000, high=35000 * 1.2, confidence=1.0
             ),
-            estimated_write_size_bytes=Interval(
-                low=1000, mid=5000, high=10_000, confidence=0.98
+            estimated_mean_write_size_bytes=Interval(
+                low=7250, mid=7250, high=7250 * 1.2, confidence=1.0
+            ),
+            estimated_mean_read_size_bytes=Interval(
+                low=5100, mid=5100, high=5100 * 1.2, confidence=1.0
             ),
         ),
         data_shape=DataShape(
-            estimated_state_size_gib=Interval(
-                low=100, mid=5000, high=10_000, confidence=0.98
-            )
+            estimated_state_size_gib=Interval(low=1662, mid=1662, high=1662, confidence=1.0),
+            estimated_state_item_count=Interval(
+                low=750000000, mid=750000000, high=750000000 * 1.2, confidence=1.0
+            ),
         ),
     )
-    plan = planner.plan(
-        model_name="org.netflix.evcache",
-        region="us-east-1",
-        desires=large_data,
-    )
-
-    assert len(plan.least_regret) >= 1
-
-    lr = plan.least_regret[0]
-    # EVCache should regret having too little RAM, disk and spending too much
-    assert all(k in lr.requirements.regrets for k in ("spend", "mem", "disk"))
-
-    # EVCache should be somewhat expensive due to the large amount of data
-    assert lr.candidate_clusters.annual_costs["evcache.zonal-clusters"] > 10_000
-    # Without replication shouldn't have network costs
-    assert len(lr.candidate_clusters.annual_costs.keys()) == 1
 
-    zc = lr.candidate_clusters.zonal[0]
+    plan = planner.plan_certain(
+        model_name="org.netflix.evcache",
+        region="us-east-1",
+        desires=inmemory_cluster_high_latency_reads_qps,
+    )
 
-    # For the sheer volume of data, it probably doesn't make sense for the least regretful cluster to not have disk.
-    assert zc.instance.drive is not None
+    for candidate in plan:
+        total_cpu_power = candidate.candidate_clusters.zonal[0].count * \
+            candidate.candidate_clusters.zonal[0].instance.cpu * \
+            candidate.candidate_clusters.zonal[0].instance.cpu_ghz
 
-    # We want at least 1 TiB of disk per zone
-    assert zc.count * zc.instance.drive.size_gib > 1000
+        assert total_cpu_power > 100
 
 
-def test_evcache_replication():
-    high_qps = CapacityDesires(
-        service_tier=1,
+def test_evcache_ondisk_low_latency_reads_cpu():
+    ondisk_cluster_low_latency_reads_qps = CapacityDesires(
+        service_tier=0,
         query_pattern=QueryPattern(
-            access_pattern=AccessPattern.latency,
             estimated_read_per_second=Interval(
-                low=10_000, mid=100_000, high=1_000_000, confidence=0.98
+                low=284, mid=7110000, high=7110000 * 1.2, confidence=1.0
            ),
             estimated_write_per_second=Interval(
-                low=10_000, mid=100_000, high=1_000_000, confidence=0.98
+                low=0, mid=2620000, high=2620000 * 1.2, confidence=1.0
+            ),
+            estimated_mean_write_size_bytes=Interval(
+                low=12000, mid=12000, high=12000 * 1.2, confidence=1.0
+            ),
+            estimated_mean_read_size_bytes=Interval(
+                low=16000, mid=16000, high=16000 * 1.2, confidence=1.0
             ),
         ),
-        # This should work out to around 200 GiB of state
         data_shape=DataShape(
+            estimated_state_size_gib=Interval(low=2306867, mid=2306867, high=2306867, confidence=1.0),
             estimated_state_item_count=Interval(
-                low=100_000_000, mid=1_000_000_000, high=10_000_000_000, confidence=0.98
-            )
+                low=132000000000, mid=132000000000, high=132000000000 * 1.2, confidence=1.0
+            ),
         ),
     )
-    plan = planner.plan(
-        model_name="org.netflix.evcache",
-        region="us-east-1",
-        desires=high_qps,
-        num_regions=3,
-        extra_model_arguments={"cross_region_replication": "sets"},
+
+    plan = planner.plan_certain(
+        model_name="org.netflix.evcache",
+        region="us-east-1",
+        desires=ondisk_cluster_low_latency_reads_qps,
     )
-    assert len(plan.least_regret) >= 2
 
-    lr = plan.least_regret[0]
-    # EVCache should regret having too little RAM, disk and spending too much
-    assert all(k in lr.requirements.regrets for k in ("spend", "mem", "disk"))
-    assert lr.requirements.zonal[0].disk_gib.mid > 200
+    for candidate in plan:
+        total_cpu_power = candidate.candidate_clusters.zonal[0].count * \
+            candidate.candidate_clusters.zonal[0].instance.cpu * \
+            candidate.candidate_clusters.zonal[0].instance.cpu_ghz
 
-    # EVCache compute should be pretty cheap for 100k RPS with 10k WPS
-    assert lr.candidate_clusters.annual_costs["evcache.zonal-clusters"] < 10000
+        assert total_cpu_power > 8000
 
-    set_inter_region = lr.candidate_clusters.annual_costs["evcache.net.inter.region"]
-    # With replication should have network costs
-    assert 10000 < set_inter_region < 40000
-    assert (
-        50000 < lr.candidate_clusters.annual_costs["evcache.net.intra.region"] < 120000
+
+def test_evcache_ondisk_high_latency_reads_cpu():
+    ondisk_cluster_high_latency_reads_qps = CapacityDesires(
+        service_tier=0,
+        query_pattern=QueryPattern(
+            estimated_read_per_second=Interval(
+                low=312000, mid=853000, high=853000 * 1.2, confidence=1.0
+            ),
+            estimated_write_per_second=Interval(
+                low=0, mid=310000, high=310000 * 1.2, confidence=1.0
+            ),
+            estimated_mean_write_size_bytes=Interval(
+                low=34500, mid=34500, high=34500 * 1.2, confidence=1.0
+            ),
+            estimated_mean_read_size_bytes=Interval(
+                low=41000, mid=41000, high=41000 * 1.2, confidence=1.0
+            ),
+        ),
+        data_shape=DataShape(
+            estimated_state_size_gib=Interval(low=281000, mid=281000, high=281000, confidence=1.0),
+            estimated_state_item_count=Interval(
+                low=8518318523, mid=8518318523, high=8518318523 * 1.2, confidence=1.0
+            ),
+        ),
     )
-    delete_plan = planner.plan(
-        model_name="org.netflix.evcache",
-        region="us-east-1",
-        desires=high_qps,
-        num_regions=3,
-        extra_model_arguments={
-            "cross_region_replication": "evicts",
-            "copies_per_region": 3,
-        },
+    plan = planner.plan_certain(
+        model_name="org.netflix.evcache",
+        region="us-east-1",
+        desires=ondisk_cluster_high_latency_reads_qps,
     )
-    lr = delete_plan.least_regret[0]
+    for candidate in plan:
+        total_cpu_power = candidate.candidate_clusters.zonal[0].count * \
+            candidate.candidate_clusters.zonal[0].instance.cpu * \
+            candidate.candidate_clusters.zonal[0].instance.cpu_ghz
 
-    # Evicts should be cheaper than sets
-    evict_inter_region = lr.candidate_clusters.annual_costs["evcache.net.inter.region"]
-    assert evict_inter_region < set_inter_region
-
-    # With replication should have network costs
-    assert 5000 < evict_inter_region < 15000
-    assert (
-        12000 < lr.candidate_clusters.annual_costs["evcache.net.intra.region"] < 40000
-    )
+        assert total_cpu_power > 800
 
 
-def test_evcache_compare_working_sets():
-    small = CapacityDesires(
-        service_tier=2,
+def test_evcache_inmemory_ram_usage():
+    inmemory_qps = CapacityDesires(
+        service_tier=1,
         query_pattern=QueryPattern(
             estimated_read_per_second=Interval(
-                low=10_000, mid=100_000, high=1_000_000, confidence=0.98
+                low=18300000, mid=34200000, high=34200000 * 1.2, confidence=1.0
             ),
             estimated_write_per_second=Interval(
-                low=10_000, mid=100_000, high=1_000_000, confidence=0.98
+                low=228000, mid=536000, high=536000 * 1.2, confidence=1.0
+            ),
+            estimated_mean_write_size_bytes=Interval(
+                low=3778, mid=3778, high=3778 * 1.2, confidence=1.0
+            ),
+            estimated_mean_read_size_bytes=Interval(
+                low=35, mid=35, high=35 * 1.2, confidence=1.0
             ),
-            estimated_write_size_bytes=Interval(
-                low=10, mid=100, high=1000, confidence=0.98
-            )
         ),
         data_shape=DataShape(
-            estimated_state_size_gib=Interval(
-                low=10, mid=100, high=1000, confidence=0.98
+            estimated_state_size_gib=Interval(low=36, mid=36, high=36, confidence=1.0),
+            estimated_state_item_count=Interval(
+                low=416000000, mid=804000000, high=804000000 * 1.2, confidence=1.0
             ),
-            estimated_working_set_percent=certain_float(0.10)
         ),
     )
-    large = small.copy(deep=True)
-    large.data_shape.estimated_working_set_percent = certain_float(0.90)
 
-    plan_small = planner.plan(
+    plan = planner.plan_certain(
         model_name="org.netflix.evcache",
         region="us-east-1",
-        desires=small,
+        desires=inmemory_qps,
     )
-    plan_large = planner.plan(
-        model_name="org.netflix.evcache",
-        region="us-east-1",
-        desires=large,
-    )
-
-    assert len(plan_small.least_regret) >= 2
-    assert len(plan_large.least_regret) >= 2
-
-    lr_small = plan_small.least_regret[0]
-    lr_large = plan_large.least_regret[0]
-
-    # Only the plan whose desires contain the smaller working set percentage should care about disk.
-    assert all(k in lr_small.requirements.regrets for k in ("spend", "mem", "disk"))
-    assert all(k in lr_large.requirements.regrets for k in ("spend", "mem"))
-
-    # Smaller working set percentage should lead to fewer costs.
-    assert lr_small.candidate_clusters.annual_costs["evcache.zonal-clusters"] < \
-        lr_large.candidate_clusters.annual_costs["evcache.zonal-clusters"]
 
-    # The large difference in working set percentage should lead to a difference in RAM.
-    assert lr_small.candidate_clusters.zonal[0].instance.ram_gib < \
-        lr_large.candidate_clusters.zonal[0].instance.ram_gib
+    for candidate in plan:
+        total_ram = candidate.candidate_clusters.zonal[0].instance.ram_gib * \
+            candidate.candidate_clusters.zonal[0].count
 
-    # The small working set percentage should lead to picking an instance with both memory and disk.
-    assert lr_small.candidate_clusters.zonal[0].instance.drive is not None
-    assert lr_small.candidate_clusters.zonal[0].instance.ram_gib is not None
+        assert total_ram > inmemory_qps.data_shape.estimated_state_size_gib.mid
 
-    # The large working set percentage should lead to only memory (no disk).
-    assert lr_large.candidate_clusters.zonal[0].instance.drive is None
-    assert lr_large.candidate_clusters.zonal[0].instance.ram_gib is not None
 
-    # Without replication shouldn't have network costs
-    assert len(lr_small.candidate_clusters.annual_costs.keys()) == 1
-    assert len(lr_large.candidate_clusters.annual_costs.keys()) == 1
-
-
-def test_evcache_compare_tiers():
-    low = CapacityDesires(
-        service_tier=0,
+def test_evcache_ondisk_disk_usage():
+    inmemory_qps = CapacityDesires(
+        service_tier=1,
         query_pattern=QueryPattern(
             estimated_read_per_second=Interval(
-                low=10_000, mid=100_000, high=1_000_000, confidence=0.98
+                low=18300000, mid=34200000, high=34200000 * 1.2, confidence=1.0
             ),
             estimated_write_per_second=Interval(
-                low=10_000, mid=100_000, high=1_000_000, confidence=0.98
+                low=228000, mid=536000, high=536000 * 1.2, confidence=1.0
+            ),
+            estimated_mean_write_size_bytes=Interval(
+                low=3778, mid=3778, high=3778 * 1.2, confidence=1.0
+            ),
+            estimated_mean_read_size_bytes=Interval(
+                low=35, mid=35, high=35 * 1.2, confidence=1.0
             ),
-            estimated_write_size_bytes=Interval(
-                low=10, mid=100, high=1000, confidence=0.98
-            )
         ),
         data_shape=DataShape(
-            estimated_state_size_gib=Interval(
-                low=10, mid=100, high=1000, confidence=0.98
+            estimated_state_size_gib=Interval(low=36, mid=36, high=36, confidence=1.0),
+            estimated_state_item_count=Interval(
+                low=416000000, mid=804000000, high=804000000 * 1.2, confidence=1.0
             ),
         ),
     )
-    high = low.copy(deep=True)
-    high.service_tier = 3
 
-    plan_low = planner.plan(
-        model_name="org.netflix.evcache",
-        region="us-east-1",
-        desires=low,
-    )
-    plan_high = planner.plan(
+    plan = planner.plan_certain(
         model_name="org.netflix.evcache",
         region="us-east-1",
-        desires=high,
+        desires=inmemory_qps,
     )
 
-    assert len(plan_low.least_regret) >= 2
-    assert len(plan_high.least_regret) >= 2
-
-    lr_low = plan_low.least_regret[0]
-    lr_high = plan_high.least_regret[0]
-
-    # EVCache should regret having too little RAM, disk and spending too much
-    assert all(k in lr_low.requirements.regrets for k in ("spend", "mem", "disk"))
-    assert all(k in lr_high.requirements.regrets for k in ("spend", "mem", "disk"))
-
-    # Lower tier should lead to greater costs.
-    assert lr_low.candidate_clusters.annual_costs["evcache.zonal-clusters"] > \
-        lr_high.candidate_clusters.annual_costs["evcache.zonal-clusters"]
-
-    # Large difference in tiers should lead to different instance family types.
-    assert lr_low.candidate_clusters.zonal[0].instance.family != lr_high.candidate_clusters.zonal[0].instance.family
+    for candidate in plan:
+        total_ram = candidate.candidate_clusters.zonal[0].instance.ram_gib * \
+            candidate.candidate_clusters.zonal[0].count
 
-    # Without replication shouldn't have network costs
-    assert len(lr_low.candidate_clusters.annual_costs.keys()) == 1
-    assert len(lr_high.candidate_clusters.annual_costs.keys()) == 1
+        assert total_ram > inmemory_qps.data_shape.estimated_state_size_gib.mid
\ No newline at end of file
diff --git a/tests/netflix/test_key_value.py b/tests/netflix/test_key_value.py
index 2cbdd9a..fb90169 100644
--- a/tests/netflix/test_key_value.py
+++ b/tests/netflix/test_key_value.py
@@ -283,10 +283,10 @@ def test_kv_plus_evcache_rps_exceeding_250k():
     assert zlr_ev.instance.family[0] in ("r", "m", "i")
 
     # Validate EVCache cost for 300k RPS + 300k WPS
-    assert least_regret_clusters.annual_costs["evcache.zonal-clusters"] < 10000
+    assert least_regret_clusters.annual_costs["evcache.zonal-clusters"] < 30000
 
     # Costs for KV + C* + EVCache clusters, including networking for C*
-    assert len(least_regret_clusters.annual_costs.keys()) == 6
+    assert len(least_regret_clusters.annual_costs.keys()) == 7
 
 
 def test_kv_plus_evcache_rps_exceeding_100k_and_sufficient_read_write_ratio():
@@ -386,10 +386,10 @@ def test_kv_plus_evcache_rps_exceeding_100k_and_sufficient_read_write_ratio():
     assert zlr_ev.instance.family[0] in ("r", "m", "i")
 
     # Validate EVCache cost for 300k RPS + 300k WPS
-    assert least_regret_clusters.annual_costs["evcache.zonal-clusters"] < 10000
+    assert least_regret_clusters.annual_costs["evcache.zonal-clusters"] < 30000
 
     # Costs for KV + C* + EVCache clusters, including networking for C*
-    assert len(least_regret_clusters.annual_costs.keys()) == 6
+    assert len(least_regret_clusters.annual_costs.keys()) == 7
 
 
 def test_kv_rps_exceeding_100k_but_insufficient_read_write_ratio():
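The extra "evcache.spread.cost" entry introduced in evcache.py is also why the annual-cost key counts above move from 6 to 7. A minimal sketch of the penalty itself at the default max_cost/min_cost; the printed values follow directly from the formula in the diff:

from service_capacity_modeling.models.org.netflix.evcache import calculate_spread_cost

for size in (1, 2, 5, 10, 11):
    # Tiny clusters pay the full penalty; it shrinks with size and drops to
    # zero once a zonal cluster has more than 10 nodes
    print(size, round(calculate_spread_cost(size)))
# 1 100000, 2 93333, 5 83333, 10 66667, 11 0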