jointserve
scheduling/myclient.py starts the global router (scheduler); scheduling/multi_turn_example.py is the user-side test client. Start the sglang instances first, then update the instance URLs listed below in spec_scheduling/joint/my_scheduler.py.
Install modelscope and download the models:

```bash
pip install modelscope
modelscope download --model Qwen/Qwen2.5-14B --local_dir /root/autodl-tmp/Qwen2.5-14B
modelscope download --model Qwen/Qwen2.5-3B --local_dir /root/autodl-tmp/Qwen2.5-3B
modelscope download --model deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --local_dir /hy-tmp/DeepSeek-R1-Distill-Qwen-7B
```

Instance URLs in spec_scheduling/joint/my_scheduler.py (uncomment the backends you actually run):

```python
self.api_prefix_urls = [
    "http://localhost:8080",
    # "http://localhost:8090",
    # "http://localhost:8060",
    # "http://localhost:8050",
    # "http://i-2.gpushare.com:44139",
    # "http://i-2.gpushare.com:45173",
    # "http://i-2.gpushare.com:44644",
    # "http://i-2.gpushare.com:29954",
]
```
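For reference, here is a minimal sketch of how a `shortest_count` strategy could pick a backend from `api_prefix_urls`. This is an assumed reading of the strategy name (fewest in-flight requests wins); the class and counter names are illustrative, not the actual my_scheduler.py code:

```python
import threading

class ShortestCountRouter:
    """Sketch: route each request to the backend with the fewest
    in-flight requests (assumed meaning of `shortest_count`)."""

    def __init__(self, api_prefix_urls):
        self.api_prefix_urls = api_prefix_urls
        # Hypothetical per-backend in-flight request counters.
        self.inflight = {url: 0 for url in api_prefix_urls}
        self.lock = threading.Lock()

    def acquire(self):
        # Pick the backend with the smallest in-flight count.
        with self.lock:
            url = min(self.api_prefix_urls, key=lambda u: self.inflight[u])
            self.inflight[url] += 1
            return url

    def release(self, url):
        # Call when the proxied request finishes.
        with self.lock:
            self.inflight[url] -= 1

router = ShortestCountRouter(["http://localhost:8080"])
backend = router.acquire()
print("dispatching to", backend)
router.release(backend)
```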
Start the global scheduler
Activate the environment and launch the scheduler (two launch variants are used; pick the one matching your layout):

```bash
conda activate specreason   # or: conda activate vllm
unset PYTHONPATH
export no_proxy="localhost,127.0.0.1,0.0.0.0,::1"
python3 scheduling/myclient.py --strategy shortest_count
# or run it as a module on port 8080:
python3 -m spec_scheduling.myclient --strategy shortest_count --port 8080
```

Then edit the model list in the scheduler:

```python
gmodels = ["/hy-tmp/Qwen2.5-3B-Instruct", "/hy-tmp/DeepSeek-R1-Distill-Qwen-7B"]
```
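Once the scheduler is up, you can smoke-test it with a plain HTTP request. This sketch assumes the router forwards OpenAI-compatible chat requests the way the vLLM backends below expose them; the endpoint path and model name are assumptions:

```python
import json
import urllib.request

# Hypothetical smoke test against the global scheduler on port 8080.
payload = {
    "model": "/hy-tmp/Qwen2.5-3B-Instruct",
    "messages": [{"role": "user", "content": "Say hello."}],
    "max_tokens": 32,
}
req = urllib.request.Request(
    "http://localhost:8080/v1/chat/completions",
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req, timeout=30) as resp:
    print(json.loads(resp.read())["choices"][0]["message"]["content"])
```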
Run the tests
Run the multi-turn client against the scheduler:

```bash
python3 scheduling/multi_turn_example.py --strategy shortest_count
# QoS-aware variant with prediction data:
python3 -m spec_scheduling.multi_turn_example --strategy qos_aware --data_type prediction_results --data_path /hy-tmp/jointrouter/Route-To-Reason-main/results/prediction_results_with_bert.csv
```

Serving benchmark:

```bash
python -m spec_scheduling.benchmarks.benchmark_serving --backend vllm --model /hy-tmp/DeepSeek-R1-Distill-Qwen-7B --dataset-name sharegpt --dataset-path /data/sharegpt.json --num-prompts 10
```

Launch the vLLM backends (one GPU each, registering with the load balancer on port 8080):

```bash
CUDA_VISIBLE_DEVICES=0 VLLM_USE_V1=1 nohup vllm serve /hy-tmp/Qwen2.5-3B-Instruct --dtype auto -tp 1 --gpu-memory-utilization 0.75 --port 30000 --max-model-len 10600 --enable-timeout-notification --request-timeout 30.0 --load-balancer-url "http://localhost:8080" &> qwen3B.log &
CUDA_VISIBLE_DEVICES=1 VLLM_USE_V1=1 nohup vllm serve /hy-tmp/Qwen2.5-7B-Instruct --dtype auto -tp 1 --gpu-memory-utilization 0.75 --port 30001 --max-model-len 10600 --enable-timeout-notification --request-timeout 30.0 --load-balancer-url "http://localhost:8080" &> qwen7B.log &
CUDA_VISIBLE_DEVICES=2 VLLM_USE_V1=1 nohup vllm serve /hy-tmp/DeepSeek-R1-Distill-Qwen-7B --dtype auto -tp 1 --gpu-memory-utilization 0.75 --port 30002 --max-model-len 10600 --enable-timeout-notification --request-timeout 30.0 --load-balancer-url "http://localhost:8080" &> deepseek7B.log &
```

Speculative-decoding (ngram) variant for the 1.5B model:

```bash
CUDA_VISIBLE_DEVICES=0 VLLM_USE_V1=1 vllm serve /hy-tmp/DeepSeek-R1-Distill-Qwen-1.5B --dtype auto -tp 1 --gpu-memory-utilization 0.45 --port 30001 --enable-prefix-caching --max-model-len 6000 --speculative_config '{"method": "ngram","num_speculative_tokens": 3,"prompt_lookup_max": 2}'
```

Model names appearing for question_id=1 (deduplicated):

- DeepSeek-R1-Distill-Qwen-14B
- DeepSeek-R1-Distill-Qwen-7B
- QwQ-32B-AWQ
- Qwen2.5-14B-Instruct
- Qwen2.5-3B-Instruct
- Qwen2.5-7B-Instruct
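Before pointing the tests at the backends, it helps to confirm each vLLM server is actually serving. A small check against vLLM's standard `/v1/models` endpoint (the port list mirrors the launch commands above):

```python
import json
import urllib.request

# Ports used by the vLLM launch commands above.
ports = [30000, 30001, 30002]

for port in ports:
    url = f"http://localhost:{port}/v1/models"
    try:
        with urllib.request.urlopen(url, timeout=5) as resp:
            models = [m["id"] for m in json.loads(resp.read())["data"]]
            print(f"port {port}: serving {models}")
    except OSError as exc:
        print(f"port {port}: not reachable ({exc})")
```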