feat: update disk cache params and benchmark_multiturn.py#1333
feat: update disk cache params and benchmark_multiturn.py#1333blueswhen wants to merge 3 commits into
Conversation
There was a problem hiding this comment.
Code Review
This pull request scales down the default shard, worker, and concurrent write task counts in the disk cache worker, and introduces a session pool execution model to the multi-turn benchmark script to support a larger pool of simulated users with active/silent swapping. The review feedback highlights outdated comments in the disk cache worker and identifies critical issues in run_full_session where progress tracking and failure states are not correctly updated upon unexpected exceptions or session failures.
Important
The consumer version of Gemini Code Assist on GitHub is being sunset. Starting June 18, 2026, new organization installations will be blocked, and all code review activity will officially cease on July 17, 2026.
For more details on the timeline and next steps, please review the Help Documentation.
| # num_shard与KVCACHE_MAX_BLOCK_SIZE相关,KVCACHE_MAX_BLOCK_SIZE默认64MB前提下, | ||
| # num_shard设置32, 能使disk cache的容量利用率达到90%,继续增大num_shard会导致容量利用率下降 | ||
| num_shard = 32 | ||
| num_worker = 48 | ||
| num_shard = 8 | ||
| num_worker = 24 | ||
| # 读写同时进行时,分配16线程用来写,32线程用来读 | ||
| max_concurrent_write_tasks = 16 | ||
| max_concurrent_write_tasks = 8 |
There was a problem hiding this comment.
The comments on lines 45-46 and 49 are now outdated and misleading because num_shard, num_worker, and max_concurrent_write_tasks have been updated. Please update the comments to reflect the new values (e.g., allocating 8 threads for writing and 16 threads for reading).
| # num_shard与KVCACHE_MAX_BLOCK_SIZE相关,KVCACHE_MAX_BLOCK_SIZE默认64MB前提下, | |
| # num_shard设置32, 能使disk cache的容量利用率达到90%,继续增大num_shard会导致容量利用率下降 | |
| num_shard = 32 | |
| num_worker = 48 | |
| num_shard = 8 | |
| num_worker = 24 | |
| # 读写同时进行时,分配16线程用来写,32线程用来读 | |
| max_concurrent_write_tasks = 16 | |
| max_concurrent_write_tasks = 8 | |
| # num_shard与KVCACHE_MAX_BLOCK_SIZE相关 | |
| num_shard = 8 | |
| num_worker = 24 | |
| # 读写同时进行时,分配8线程用来写,16线程用来读 | |
| max_concurrent_write_tasks = 8 |
| session = SessionState(session_id, tokenizer, start_input_len, max_input_len, max_turns, base_seed) | ||
| per_turn: List[Dict] = [] | ||
| turn_idx = 0 | ||
| failed = False | ||
|
|
||
| try: | ||
| while turn_idx < max_turns and prompt_len < max_input_len: | ||
| turn_output_len = rng.randint(min_output_len, output_len) | ||
| result = stream_one_turn( | ||
| tokenizer=tokenizer, | ||
| url=url, | ||
| model_name=model_name, | ||
| prompt=prompt, | ||
| prompt_token_len=prompt_len, | ||
| max_new_tokens=turn_output_len, | ||
| request_timeout_s=request_timeout_s, | ||
| while not session.is_completed(): | ||
| turn_output_len, result = request_session_turn( | ||
| session, | ||
| tokenizer, | ||
| url, | ||
| model_name, | ||
| min_output_len, | ||
| output_len, | ||
| request_timeout_s, | ||
| ) | ||
| if result is None: | ||
| failed = True | ||
| break | ||
|
|
||
| per_turn.append(result) | ||
| session.turn_idx += 1 | ||
| session_completed = session.is_completed() | ||
|
|
||
| with progress_lock: | ||
| progress_state["finished_turns"] += 1 | ||
| print( | ||
| f"\rconc={progress_state['concurrency']} " | ||
| f"finished_turns={progress_state['finished_turns']} " | ||
| f"active_sessions={progress_state['active_sessions']}\033[K", | ||
| end="", | ||
| flush=True, | ||
| progress_state["prompt_tokens_total"] += result["prompt_tokens"] | ||
| progress_state["cached_tokens_total"] += result["cached_tokens"] | ||
| if result.get("cached_tokens_reported"): | ||
| progress_state["cached_reported_turns"] += 1 | ||
|
|
||
| print_progress_line( | ||
| concurrency=progress_state["concurrency"], | ||
| finished_turns=progress_state["finished_turns"], | ||
| completed_sessions=progress_state["completed_sessions"] + int(session_completed), | ||
| session_num=progress_state["session_num"], | ||
| active_sessions=progress_state["active_sessions"] - int(session_completed), | ||
| prompt_tokens_total=progress_state["prompt_tokens_total"], | ||
| cached_tokens_total=progress_state["cached_tokens_total"], | ||
| cached_reported_turns=progress_state["cached_reported_turns"], | ||
| failed_sessions=progress_state["failed_sessions"], | ||
| wall_start=progress_state["wall_start"], | ||
| ) | ||
| turn_input_len = rng.randint(min_turn_input_increment, turn_input_increment) | ||
| prompt, prompt_len = append_turn_input( | ||
|
|
||
| if session_completed: | ||
| break | ||
|
|
||
| session.prompt, session.prompt_len = build_next_prompt( | ||
| session, | ||
| tokenizer, | ||
| prompt, | ||
| prompt_len, | ||
| turn_output_len, | ||
| turn_input_len, | ||
| rng, | ||
| min_turn_input_increment, | ||
| turn_input_increment, | ||
| ) | ||
| turn_idx += 1 | ||
| finally: | ||
| with progress_lock: | ||
| progress_state["active_sessions"] -= 1 | ||
| if failed: | ||
| progress_state["failed_sessions"] += 1 | ||
| else: | ||
| progress_state["completed_sessions"] += 1 |
There was a problem hiding this comment.
There are two issues with the progress tracking and failure handling in run_full_session:
- If a session terminates due to an unexpected exception or failure (i.e.,
result is None), the progress line is not updated to reflect the failure or the decrement in active sessions until another session completes a turn. If it is the last session, the console will never show the correct final state. - If an unexpected exception is raised during the session,
failedremainsFalse, causing the session to be incorrectly counted as a completed session instead of a failed one.
We can resolve both issues by initializing a completed flag, setting it to True only upon successful completion, and always calling print_progress_line in the finally block to ensure the console is kept up-to-date.
session = SessionState(session_id, tokenizer, start_input_len, max_input_len, max_turns, base_seed)
per_turn: List[Dict] = []
completed = False
try:
while not session.is_completed():
turn_output_len, result = request_session_turn(
session,
tokenizer,
url,
model_name,
min_output_len,
output_len,
request_timeout_s,
)
if result is None:
break
per_turn.append(result)
session.turn_idx += 1
session_completed = session.is_completed()
with progress_lock:
progress_state["finished_turns"] += 1
progress_state["prompt_tokens_total"] += result["prompt_tokens"]
progress_state["cached_tokens_total"] += result["cached_tokens"]
if result.get("cached_tokens_reported"):
progress_state["cached_reported_turns"] += 1
print_progress_line(
concurrency=progress_state["concurrency"],
finished_turns=progress_state["finished_turns"],
completed_sessions=progress_state["completed_sessions"],
session_num=progress_state["session_num"],
active_sessions=progress_state["active_sessions"],
prompt_tokens_total=progress_state["prompt_tokens_total"],
cached_tokens_total=progress_state["cached_tokens_total"],
cached_reported_turns=progress_state["cached_reported_turns"],
failed_sessions=progress_state["failed_sessions"],
wall_start=progress_state["wall_start"],
)
if session_completed:
break
session.prompt, session.prompt_len = build_next_prompt(
session,
tokenizer,
turn_output_len,
min_turn_input_increment,
turn_input_increment,
)
completed = session.is_completed()
finally:
with progress_lock:
progress_state["active_sessions"] -= 1
if completed:
progress_state["completed_sessions"] += 1
else:
progress_state["failed_sessions"] += 1
print_progress_line(
concurrency=progress_state["concurrency"],
finished_turns=progress_state["finished_turns"],
completed_sessions=progress_state["completed_sessions"],
session_num=progress_state["session_num"],
active_sessions=progress_state["active_sessions"],
prompt_tokens_total=progress_state["prompt_tokens_total"],
cached_tokens_total=progress_state["cached_tokens_total"],
cached_reported_turns=progress_state["cached_reported_turns"],
failed_sessions=progress_state["failed_sessions"],
wall_start=progress_state["wall_start"],
)
No description provided.