update
cd /hy-tmp/jointrouter
pip install vllm==0.11.0
CUDA_VISIBLE_DEVICES=0 VLLM_USE_V1=1 vllm serve /hy-tmp/Qwen2.5-3B-Instruct --dtype auto -tp 1 --gpu-memory-utilization 0.75 --port 30000 --max-model-len 10600 --enable-timeout-notification --request-timeout 30.0 --load-balancer-url "http://localhost:8080"
CUDA_VISIBLE_DEVICES=1 VLLM_USE_V1=1 vllm serve /hy-tmp/Qwen2.5-7B-Instruct --dtype auto -tp 1 --gpu-memory-utilization 0.75 --port 30001 --max-model-len 10600 --enable-timeout-notification --request-timeout 30.0 --load-balancer-url "http://localhost:8080"
DeepSeek 7B:
CUDA_VISIBLE_DEVICES=0 VLLM_USE_V1=1 vllm serve /hy-tmp/DeepSeek-R1-Distill-Qwen-7B --dtype auto -tp 1 --gpu-memory-utilization 0.75 --port 8080 --max-model-len 12600 --enable-timeout-notification --request-timeout 30.0 --load-balancer-url "http://i-2.gpushare.com:26224"
注意一件事:my_scheduler.py 里的 api_prefix_urls 必须和你这几个 vLLM 端口对应上,不然路由器连不到模型。
python3 -m spec_scheduling.myclient --strategy round_robin --port 8080
python3 -m spec_scheduling.multi_turn_example --strategy round_robin --data_type prediction_results --data_path /hy-tmp/jointrouter/Route-To-Reason-main/data/split_data/train.csv --profile --profile_server_index 0 --profile_result_name qwen3b_train/result_qwen3b_train --num_request 1470
加入新的数据集
export MODEL=Meta-Llama-3.1-8B
python3 /hy-tmp/humaneval_tools/run_humaneval_local.py --dataset /hy-tmp/HumanEval.jsonl.gz --model-path "/hy-tmp/${MODEL}" --model-name "${MODEL}" --strategy direct --max-new-tokens 10240 --output "/hy-tmp/humaneval_${MODEL}_direct.jsonl"
python3 /hy-tmp/humaneval_tools/evaluate_humaneval.py --dataset /hy-tmp/HumanEval.jsonl.gz --generations /hy-tmp/humaneval_qwen3b_direct.jsonl --results-jsonl /hy-tmp/humaneval_qwen3b_direct_results.jsonl --results-csv /hy-tmp/humaneval_qwen3b_direct_test.csv
旧的数据集 生成hidden state 要训练 train
python /hy-tmp/jointrouter/exps/get_hiddenstate.py --dataset /hy-tmp/jointrouter/Route-To-Reason-main/data/split_data/train.csv --model-path /hy-tmp/Qwen2.5-3B-Instruct --model-name Qwen2.5-3B --dataset-model-filter Qwen2.5-3B-Instruct --output /hy-tmp/data/route_train_results_Qwen2.5-3B-Instruct.jsonl --token-positions 1,2,4,8,16,last_token --max-new-tokens 4096 --batch-size 8
也要 test
nohup python /hy-tmp/jointrouter/exps/get_hiddenstate.py --dataset /hy-tmp/jointrouter/Route-To-Reason-main/data/split_data/test.csv --model-path /hy-tmp/Qwen2.5-3B-Instruct --model-name Qwen2.5-3B --dataset-model-filter Qwen2.5-3B-Instruct --output /hy-tmp/data/route_test_results_Qwen2.5-3B-Instruct.jsonl --token-positions 1,2,4,8,16,last_token --max-new-tokens 4096 --batch-size 8 &
最大长度为2048的话,会不会被截断: 总记录数:2520
python /hy-tmp/jointrouter/exps/route_test_rebuttal_supplement.py --model-name Qwen2.5-7B --input-prefix /hy-tmp/data/route_test_results_hiddentstate_-1.jsonl --output-dir /hy-tmp/jointrouter/exps/outputs
nohup python /hy-tmp/jointrouter/exps/route_test_rebuttal_supplement.py --model-name Qwen2.5-3B-Instruct --train-input-prefix /hy-tmp/data/route_train_results_Qwen2.5-3B-Instruct_hiddentstate_-1.jsonl --test-input-prefix /hy-tmp/data/route_test_results_Qwen2.5-3B-Instruct_hiddentstate_-1.jsonl --output-dir /hy-tmp/jointrouter/exps/outputs --hidden-only &
python /hy-tmp/jointrouter/exps/generate_last_token_index_csv.py 生成index
生成train的index
python /hy-tmp/jointrouter/exps/generate_last_token_index_csv.py --hiddenstate-jsonl /hy-tmp/data/route_train_results_Qwen2.5-3B-Instruct_hiddentstate_-1.jsonl --train-predictions-csv /hy-tmp/jointrouter/exps/outputs/rerouter_train_predictions_results_Qwen2.5-3B-Instruct_route_test_token_last_token.csv --test-predictions-csv /hy-tmp/jointrouter/exps/outputs/rerouter_train_predictions_results_Qwen2.5-3B-Instruct_route_test_token_last_token.csv --train-output-csv /hy-tmp/jointrouter/exps/outputs/train_last_token_index_instruct_3B.csv --test-output-csv /tmp/ignore_train_dup.csv
生成test的index
python /hy-tmp/jointrouter/exps/generate_last_token_index_csv.py --hiddenstate-jsonl /hy-tmp/data/route_test_results_Qwen2.5-3B-Instruct_hiddentstate_-1.jsonl --train-predictions-csv /hy-tmp/jointrouter/exps/outputs/rerouter_test_predictions_results_Qwen2.5-3B-Instruct_route_test_token_last_token.csv --test-predictions-csv /hy-tmp/jointrouter/exps/outputs/rerouter_test_predictions_results_Qwen2.5-3B-Instruct_route_test_token_last_token.csv --train-output-csv /tmp/ignore_test_dup.csv --test-output-csv /hy-tmp/jointrouter/exps/outputs/test_last_token_index_instruct_3B.csv
将数据集分成train val test
python3 /hy-tmp/jointrouter/exps/split_threshold_tuning_data.py --input-predictions-prefix /hy-tmp/jointrouter/exps/outputs/rerouter_train_predictions_results_Qwen2.5-3B-Instruct_route_test --input-last-token-index-csv /hy-tmp/jointrouter/exps/outputs/train_last_token_index_instruct_3B.csv --output-train-predictions-prefix /hy-tmp/jointrouter/exps/outputs/resplit_train_predictions_results_Qwen2.5-3B-Instruct --output-val-predictions-prefix /hy-tmp/jointrouter/exps/outputs/resplit_val_predictions_results_Qwen2.5-3B-Instruct --output-train-last-token-index-csv /hy-tmp/jointrouter/exps/outputs/resplit_train_last_token_index_instruct_3B.csv --output-val-last-token-index-csv /hy-tmp/jointrouter/exps/outputs/resplit_val_last_token_index_instruct_3B.csv --val-ratio 0.2 --seed 42
阈值模拟
python3 /hy-tmp/jointrouter/exps/threshold_tuning_simulation.py --train-predictions-prefix=/hy-tmp/jointrouter/exps/outputs/resplit_train_predictions_results_Qwen2.5-3B-Instruct --val-predictions-prefix=/hy-tmp/jointrouter/exps/outputs/resplit_val_predictions_results_Qwen2.5-3B-Instruct --test-predictions-prefix=/hy-tmp/jointrouter/exps/outputs/rerouter_test_predictions_results_Qwen2.5-3B-Instruct_route_test --train-last-token-index-csv=/hy-tmp/jointrouter/exps/outputs/resplit_train_last_token_index_instruct_3B.csv --val-last-token-index-csv=/hy-tmp/jointrouter/exps/outputs/resplit_val_last_token_index_instruct_3B.csv --test-last-token-index-csv=/hy-tmp/jointrouter/exps/outputs/test_last_token_index_instruct_3B.csv --per-token-latency=0.02 --search-objective=accuracy_bounded --target-accuracy=0.99 --small-latency=1.0 --large-latency=5.0 --search-steps=200 --output-json=/hy-tmp/jointrouter/exps/outputs/threshold_tuning_simulation_qwen3b_instruct_with_token_latency_train_val_test.json
版权所有:中国计算机学会技术支持:开源发展技术委员会 京ICP备13000930号-9 京公网安备 11010802047560号
cd /hy-tmp/jointrouter
pip install vllm==0.11.0
CUDA_VISIBLE_DEVICES=0 VLLM_USE_V1=1 vllm serve /hy-tmp/Qwen2.5-3B-Instruct \
  --dtype auto -tp 1 --gpu-memory-utilization 0.75 \
  --port 30000 --max-model-len 10600 \
  --enable-timeout-notification --request-timeout 30.0 \
  --load-balancer-url "http://localhost:8080"
CUDA_VISIBLE_DEVICES=1 VLLM_USE_V1=1 vllm serve /hy-tmp/Qwen2.5-7B-Instruct \
  --dtype auto -tp 1 --gpu-memory-utilization 0.75 \
  --port 30001 --max-model-len 10600 \
  --enable-timeout-notification --request-timeout 30.0 \
  --load-balancer-url "http://localhost:8080"
DeepSeek 7B:
CUDA_VISIBLE_DEVICES=0 VLLM_USE_V1=1 vllm serve /hy-tmp/DeepSeek-R1-Distill-Qwen-7B \
  --dtype auto -tp 1 --gpu-memory-utilization 0.75 \
  --port 8080 --max-model-len 12600 \
  --enable-timeout-notification --request-timeout 30.0 \
  --load-balancer-url "http://i-2.gpushare.com:26224"
注意一件事:my_scheduler.py 里的 api_prefix_urls 必须和你这几个 vLLM 端口对应上,不然路由器连不到模型。
python3 -m spec_scheduling.myclient --strategy round_robin --port 8080
python3 -m spec_scheduling.multi_turn_example \
  --strategy round_robin \
  --data_type prediction_results \
  --data_path /hy-tmp/jointrouter/Route-To-Reason-main/data/split_data/train.csv \
  --profile \
  --profile_server_index 0 \
  --profile_result_name qwen3b_train/result_qwen3b_train \
  --num_request 1470
加入新的数据集
DeepSeek-R1-Distill-Qwen-7B
export MODEL=Meta-Llama-3.1-8B
python3 /hy-tmp/humaneval_tools/run_humaneval_local.py \
  --dataset /hy-tmp/HumanEval.jsonl.gz \
  --model-path "/hy-tmp/${MODEL}" \
  --model-name "${MODEL}" \
  --strategy direct \
  --max-new-tokens 10240 \
  --output "/hy-tmp/humaneval_${MODEL}_direct.jsonl"
python3 /hy-tmp/humaneval_tools/evaluate_humaneval.py \
  --dataset /hy-tmp/HumanEval.jsonl.gz \
  --generations /hy-tmp/humaneval_qwen3b_direct.jsonl \
  --results-jsonl /hy-tmp/humaneval_qwen3b_direct_results.jsonl \
  --results-csv /hy-tmp/humaneval_qwen3b_direct_test.csv
旧的数据集 生成hidden state 要训练 train
python /hy-tmp/jointrouter/exps/get_hiddenstate.py \
  --dataset /hy-tmp/jointrouter/Route-To-Reason-main/data/split_data/train.csv \
  --model-path /hy-tmp/Qwen2.5-3B-Instruct \
  --model-name Qwen2.5-3B \
  --dataset-model-filter Qwen2.5-3B-Instruct \
  --output /hy-tmp/data/route_train_results_Qwen2.5-3B-Instruct.jsonl \
  --token-positions 1,2,4,8,16,last_token \
  --max-new-tokens 4096 \
  --batch-size 8
也要 test
nohup python /hy-tmp/jointrouter/exps/get_hiddenstate.py \
  --dataset /hy-tmp/jointrouter/Route-To-Reason-main/data/split_data/test.csv \
  --model-path /hy-tmp/Qwen2.5-3B-Instruct \
  --model-name Qwen2.5-3B \
  --dataset-model-filter Qwen2.5-3B-Instruct \
  --output /hy-tmp/data/route_test_results_Qwen2.5-3B-Instruct.jsonl \
  --token-positions 1,2,4,8,16,last_token \
  --max-new-tokens 4096 \
  --batch-size 8 &
最大长度为2048的话,会不会被截断: 总记录数:2520
生成不同的预测器的预测结果
python /hy-tmp/jointrouter/exps/route_test_rebuttal_supplement.py --model-name Qwen2.5-7B --input-prefix /hy-tmp/data/route_test_results_hiddentstate_-1.jsonl --output-dir /hy-tmp/jointrouter/exps/outputs
nohup python /hy-tmp/jointrouter/exps/route_test_rebuttal_supplement.py \
  --model-name Qwen2.5-3B-Instruct \
  --train-input-prefix /hy-tmp/data/route_train_results_Qwen2.5-3B-Instruct_hiddentstate_-1.jsonl \
  --test-input-prefix /hy-tmp/data/route_test_results_Qwen2.5-3B-Instruct_hiddentstate_-1.jsonl \
  --output-dir /hy-tmp/jointrouter/exps/outputs \
  --hidden-only &
判断预测器结果的效果
python /hy-tmp/jointrouter/exps/generate_last_token_index_csv.py 生成index
生成train的index
python /hy-tmp/jointrouter/exps/generate_last_token_index_csv.py \
  --hiddenstate-jsonl /hy-tmp/data/route_train_results_Qwen2.5-3B-Instruct_hiddentstate_-1.jsonl \
  --train-predictions-csv /hy-tmp/jointrouter/exps/outputs/rerouter_train_predictions_results_Qwen2.5-3B-Instruct_route_test_token_last_token.csv \
  --test-predictions-csv /hy-tmp/jointrouter/exps/outputs/rerouter_train_predictions_results_Qwen2.5-3B-Instruct_route_test_token_last_token.csv \
  --train-output-csv /hy-tmp/jointrouter/exps/outputs/train_last_token_index_instruct_3B.csv \
  --test-output-csv /tmp/ignore_train_dup.csv
生成test的index
python /hy-tmp/jointrouter/exps/generate_last_token_index_csv.py \
  --hiddenstate-jsonl /hy-tmp/data/route_test_results_Qwen2.5-3B-Instruct_hiddentstate_-1.jsonl \
  --train-predictions-csv /hy-tmp/jointrouter/exps/outputs/rerouter_test_predictions_results_Qwen2.5-3B-Instruct_route_test_token_last_token.csv \
  --test-predictions-csv /hy-tmp/jointrouter/exps/outputs/rerouter_test_predictions_results_Qwen2.5-3B-Instruct_route_test_token_last_token.csv \
  --train-output-csv /tmp/ignore_test_dup.csv \
  --test-output-csv /hy-tmp/jointrouter/exps/outputs/test_last_token_index_instruct_3B.csv
将数据集分成train val test
python3 /hy-tmp/jointrouter/exps/split_threshold_tuning_data.py \
  --input-predictions-prefix /hy-tmp/jointrouter/exps/outputs/rerouter_train_predictions_results_Qwen2.5-3B-Instruct_route_test \
  --input-last-token-index-csv /hy-tmp/jointrouter/exps/outputs/train_last_token_index_instruct_3B.csv \
  --output-train-predictions-prefix /hy-tmp/jointrouter/exps/outputs/resplit_train_predictions_results_Qwen2.5-3B-Instruct \
  --output-val-predictions-prefix /hy-tmp/jointrouter/exps/outputs/resplit_val_predictions_results_Qwen2.5-3B-Instruct \
  --output-train-last-token-index-csv /hy-tmp/jointrouter/exps/outputs/resplit_train_last_token_index_instruct_3B.csv \
  --output-val-last-token-index-csv /hy-tmp/jointrouter/exps/outputs/resplit_val_last_token_index_instruct_3B.csv \
  --val-ratio 0.2 \
  --seed 42
阈值模拟
python3 /hy-tmp/jointrouter/exps/threshold_tuning_simulation.py --train-predictions-prefix=/hy-tmp/jointrouter/exps/outputs/resplit_train_predictions_results_Qwen2.5-3B-Instruct --val-predictions-prefix=/hy-tmp/jointrouter/exps/outputs/resplit_val_predictions_results_Qwen2.5-3B-Instruct --test-predictions-prefix=/hy-tmp/jointrouter/exps/outputs/rerouter_test_predictions_results_Qwen2.5-3B-Instruct_route_test --train-last-token-index-csv=/hy-tmp/jointrouter/exps/outputs/resplit_train_last_token_index_instruct_3B.csv --val-last-token-index-csv=/hy-tmp/jointrouter/exps/outputs/resplit_val_last_token_index_instruct_3B.csv --test-last-token-index-csv=/hy-tmp/jointrouter/exps/outputs/test_last_token_index_instruct_3B.csv --per-token-latency=0.02 --search-objective=accuracy_bounded --target-accuracy=0.99 --small-latency=1.0 --large-latency=5.0 --search-steps=200 --output-json=/hy-tmp/jointrouter/exps/outputs/threshold_tuning_simulation_qwen3b_instruct_with_token_latency_train_val_test.json