# Launches NNODES xllm server processes in the background on this host, one per
# NPU logical device, each listening on its own port (START_PORT + rank).
# Each process's stdout/stderr is written to $LOG_DIR/node_<rank>.log.
#
# Before running, adjust MODEL_PATH, ASCEND_RT_VISIBLE_DEVICES and, if needed,
# the ports below. The Ascend set_env.sh scripts and the xllm binary are
# expected at the paths used below.

# 1. Environment variable setup
export PYTHON_INCLUDE_PATH="$(python3 -c 'from sysconfig import get_paths; print(get_paths()["include"])')"
# NOTE(review): PYTHON_LIB_PATH is derived from the same "include" key as
# PYTHON_INCLUDE_PATH. This looks like a copy-paste slip; confirm whether a
# library directory was intended before changing it.
export PYTHON_LIB_PATH="$(python3 -c 'from sysconfig import get_paths; print(get_paths()["include"])')"
export PYTORCH_NPU_INSTALL_PATH=/usr/local/libtorch_npu/ # NPU version PyTorch path
export PYTORCH_INSTALL_PATH="$(python3 -c 'import torch, os; print(os.path.dirname(os.path.abspath(torch.__file__)))')" # PyTorch installation path
export LIBTORCH_ROOT="$PYTORCH_INSTALL_PATH" # LibTorch path
# Add NPU library path. The ${VAR:+:...} form avoids a trailing ':' (which the
# dynamic loader treats as the current directory) when LD_LIBRARY_PATH is unset
# or empty.
export LD_LIBRARY_PATH="/usr/local/libtorch_npu/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

# 2. Load npu environment
source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh
export ASCEND_RT_VISIBLE_DEVICES=10,11
export ASDOPS_LOG_TO_STDOUT=1
export ASDOPS_LOG_LEVEL=ERROR
export ASDOPS_LOG_TO_FILE=1
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export NPU_MEMORY_FRACTION=0.98
export ATB_WORKSPACE_MEM_ALLOC_ALG_TYPE=3
export ATB_WORKSPACE_MEM_ALLOC_GLOBAL=1
export OMP_NUM_THREADS=12
export HCCL_CONNECT_TIMEOUT=7200
export INF_NAN_MODE_ENABLE=0

# 3. Clean up old logs
# Removes everything matching core.* in the current directory. The leading
# backslash bypasses any alias defined for rm (e.g. an interactive "rm -i").
\rm -rf core.*

# 4. Start distributed service
MODEL_PATH="/path/to/your/Qwen2-7B-Instruct" # Model path
MASTER_NODE_ADDR="127.0.0.1:9748"            # Master node address (must be globally consistent)
START_PORT=18000                             # Service starting port
START_DEVICE=0                               # Starting NPU logical device number
LOG_DIR="log"                                # Log directory
NNODES=2                                     # Number of nodes (this script starts 2 processes)

export HCCL_IF_BASE_PORT=43432                 # HCCL communication base port
export FOLLY_DEBUG_MEMORYIDLER_DISABLE_UNMAP=1 # Disable memory release (improves stability)

# The per-node log files are written here, so the directory must exist first.
mkdir -p -- "$LOG_DIR"

for ((i = 0; i < NNODES; i++)); do
  # Rank i gets its own port and its own logical device.
  PORT=$((START_PORT + i))
  DEVICE=$((START_DEVICE + i))
  LOG_FILE="$LOG_DIR/node_$i.log"

  # Run each server in the background so all ranks start without waiting on
  # one another; stdout and stderr both go to this rank's log file.
  ./xllm/build/xllm/core/server/xllm \
    --model "$MODEL_PATH" \
    --devices="npu:$DEVICE" \
    --port "$PORT" \
    --master_node_addr="$MASTER_NODE_ADDR" \
    --nnodes="$NNODES" \
    --max_memory_utilization=0.86 \
    --max_tokens_per_batch=40000 \
    --max_seqs_per_batch=256 \
    --enable_mla=false \
    --block_size=128 \
    --communication_backend="hccl" \
    --enable_prefix_cache=false \
    --enable_chunked_prefill=true \
    --enable_schedule_overlap=true \
    --node_rank="$i" >"$LOG_FILE" 2>&1 &
done
This example starts two nodes; the node count and each process's rank are configured with --nnodes=$NNODES and --node_rank=$i. The NPU devices to use can also be selected through the ASCEND_RT_VISIBLE_DEVICES environment variable.
The client test command is the same as in the previous chapter, Client Invocation.