Upload folder using huggingface_hub
(This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.)
- .gitattributes +8 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/8k-100.sh +75 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/config.json +56 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/delta_net_1B.json +29 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/delta_net_340M.json +26 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gated_deltanet_1B.json +22 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gated_deltanet_340M.json +22 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gdn_6_1_340M.json +50 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gdn_6_1_340M_bf16.json +50 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gdn_6_nsa_1_340M.json +53 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gla_340M.json +24 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gla_7B.json +25 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gsa_340M.json +29 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/hgrn2_340M.json +20 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_1B.json +32 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_340M.json +32 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_6_1_340M.json +50 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba_1B.json +30 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba_340M.json +30 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/samba_1B.json +52 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/sba_340m.json +18 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_1B.json +22 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_340M.json +18 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_7B.json +21 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/generation_config.json +6 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/0/error.json +1 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/0/stderr.log +573 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/0/stdout.log +0 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/1/error.json +1 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/1/stderr.log +571 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/1/stdout.log +0 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/2/error.json +1 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/2/stderr.log +571 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/2/stdout.log +0 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/3/error.json +1 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/3/stderr.log +571 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/3/stdout.log +0 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/4/error.json +1 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/4/stderr.log +571 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/4/stdout.log +0 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/5/error.json +1 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/5/stderr.log +571 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/5/stdout.log +0 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/6/error.json +1 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/6/stderr.log +571 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/6/stdout.log +0 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/7/error.json +1 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/7/stderr.log +571 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/7/stdout.log +0 -0
- bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/0/stderr.log +3 -0
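
For reference, a commit titled like this one is normally produced with the huggingface_hub folder-upload API. A minimal sketch (the repo_id below is a placeholder, not taken from this commit):

    from huggingface_hub import HfApi

    # Upload the experiment dump folder in one commit; the large stderr.log
    # files go through Git LFS, hence the .gitattributes changes below.
    api = HfApi()
    api.upload_folder(
        folder_path="exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2",
        repo_id="user/repo",  # placeholder
        repo_type="model",
        commit_message="Upload folder using huggingface_hub",
    )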
.gitattributes
CHANGED
@@ -41,3 +41,11 @@
 bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/5/stderr.log filter=lfs diff=lfs merge=lfs -text
 bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/6/stderr.log filter=lfs diff=lfs merge=lfs -text
 bf16-gdn_6_nsa_1_340M.json-ctx65536-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs1-nn1-gas2/logs/none_g0y0s4gd/attempt_0/7/stderr.log filter=lfs diff=lfs merge=lfs -text
+bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/0/stderr.log filter=lfs diff=lfs merge=lfs -text
+bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/1/stderr.log filter=lfs diff=lfs merge=lfs -text
+bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/2/stderr.log filter=lfs diff=lfs merge=lfs -text
+bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/3/stderr.log filter=lfs diff=lfs merge=lfs -text
+bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/4/stderr.log filter=lfs diff=lfs merge=lfs -text
+bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/5/stderr.log filter=lfs diff=lfs merge=lfs -text
+bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/6/stderr.log filter=lfs diff=lfs merge=lfs -text
+bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/7/stderr.log filter=lfs diff=lfs merge=lfs -text
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/8k-100.sh
ADDED
@@ -0,0 +1,75 @@
FLAME_PATH=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame
DATASET_ROOT=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset
TOKENIZER=/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer

cd $FLAME_PATH
source .venv/bin/activate

# =========== train config ===========
CONFIG=${1:-transformer_340M.json}
SEQ_LEN=8192
WARMUP_STEPS=100
STEPS=95366
LR=3e-4
BATCH_SIZE=8
GAS=2
DECAY_TYPE=linear
DECAY_RATIO=1
NNODE=1
NGPU=8
LOG_RANK=0
EXTRA_ARGS="--training.mixed_precision_param bfloat16"
EXTRA_NAME="bf16"
# ====================================

# if jq command is not found, install it
if ! command -v jq &> /dev/null; then
    echo "jq could not be found, installing it..."
    sudo yum install -y jq
fi

export WANDB_ERROR_REPORTING=False

if [ -n "$EXTRA_NAME" ]; then
    EXTRA_NAME="${EXTRA_NAME}-"
fi

EXP_NAME=${EXTRA_NAME}$(basename $CONFIG | sed 's/\.config//')-ctx${SEQ_LEN}-steps${STEPS}-lr${LR}-decay_type${DECAY_TYPE}-decay_ratio${DECAY_RATIO}-bs${BATCH_SIZE}-nn${NNODE}-gas${GAS}

bash train.sh \
    --job.config_file flame/models/fla.toml \
    --job.dump_folder $FLAME_PATH/exp/$EXP_NAME \
    --model.config $FLAME_PATH/configs/$CONFIG \
    --model.tokenizer_path $TOKENIZER \
    --optimizer.name AdamW \
    --optimizer.eps 1e-8 \
    --optimizer.lr $LR \
    --lr_scheduler.warmup_steps $WARMUP_STEPS \
    --lr_scheduler.lr_min 0.01 \
    --lr_scheduler.decay_type $DECAY_TYPE \
    --lr_scheduler.decay_ratio $DECAY_RATIO \
    --training.batch_size $BATCH_SIZE \
    --training.seq_len $SEQ_LEN \
    --training.context_len $SEQ_LEN \
    --training.gradient_accumulation_steps $GAS \
    --training.steps $STEPS \
    --training.max_norm 1.0 \
    --training.skip_nan_inf \
    --training.dataset $DATASET_ROOT/fineweb-edu-sample,$DATASET_ROOT/small_repos_20B_sample_merged,$DATASET_ROOT/megamath-web-pro \
    --training.data_probs 0.55,0.3,0.15 \
    --training.dataset_split train,train,train \
    --training.dataset_name default,default,default \
    --training.streaming \
    --training.num_workers 32 \
    --training.prefetch_factor 2 \
    --training.seed 42 \
    --training.compile \
    --checkpoint.interval 8192 \
    --checkpoint.load_step -1 \
    --checkpoint.keep_latest_k 100 \
    --checkpoint.export_dtype bfloat16 \
    --metrics.log_freq 1 \
    --metrics.enable_tensorboard \
    --training.streaming \
    ${EXTRA_ARGS}
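
A few notes on this script: it is invoked as `bash 8k-100.sh <config>.json` (this run evidently used gdn_6_nsa_1_340M.json, matching the folder name); the `sed 's/\.config//'` strips a `.config` suffix rather than `.json`, which is why the experiment name keeps the `.json` extension; and `--training.streaming` is passed twice, the second occurrence being redundant. A minimal Python sketch of the naming logic, equivalent by inspection rather than part of the repo:

    import os
    import re

    def exp_name(config="gdn_6_nsa_1_340M.json", extra="bf16", seq_len=8192,
                 steps=95366, lr="3e-4", decay_type="linear", decay_ratio=1,
                 batch_size=8, nnode=1, gas=2):
        # Mirrors EXP_NAME=${EXTRA_NAME}$(basename $CONFIG | sed 's/\.config//')-...
        base = re.sub(r"\.config", "", os.path.basename(config))  # '.json' survives
        prefix = f"{extra}-" if extra else ""
        return (f"{prefix}{base}-ctx{seq_len}-steps{steps}-lr{lr}"
                f"-decay_type{decay_type}-decay_ratio{decay_ratio}"
                f"-bs{batch_size}-nn{nnode}-gas{gas}")

    assert exp_name() == ("bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4"
                          "-decay_typelinear-decay_ratio1-bs8-nn1-gas2")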
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/config.json
ADDED
@@ -0,0 +1,56 @@
{
  "allow_neg_eigval": false,
  "architectures": [
    "GatedDeltaNetForCausalLM"
  ],
  "attn": {
    "block_counts": 16,
    "block_size": 64,
    "layers": [
      5,
      11,
      17,
      23
    ],
    "num_heads": 32,
    "num_kv_heads": 2,
    "qkv_bias": false,
    "rope_theta": 160000.0,
    "type": "nsa",
    "window_size": 512
  },
  "attn_mode": "chunk",
  "bos_token_id": 1,
  "conv_size": 4,
  "eos_token_id": 2,
  "expand_k": 1,
  "expand_v": 1,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "fuse_swiglu": true,
  "head_dim": 256,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "max_position_embeddings": 8192,
  "model_type": "gated_deltanet",
  "norm_eps": 1e-06,
  "norm_first": false,
  "num_heads": 4,
  "num_hidden_layers": 24,
  "num_v_heads": null,
  "qk_activation": "silu",
  "qk_norm": "l2",
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.53.3",
  "use_beta": true,
  "use_cache": true,
  "use_gate": true,
  "use_l2warp": false,
  "use_output_norm": true,
  "use_short_conv": true,
  "vocab_size": 32000
}
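
To consume this exported config, a minimal sketch (assuming the flash-linear-attention package is installed; importing `fla` is, to my understanding, what registers the `gated_deltanet` model type with the transformers Auto classes):

    import fla  # noqa: F401 -- side effect: registers FLA model types
    from transformers import AutoConfig, AutoModelForCausalLM

    ckpt = ("bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4"
            "-decay_typelinear-decay_ratio1-bs8-nn1-gas2")
    config = AutoConfig.from_pretrained(ckpt)
    print(config.model_type, config.attn["type"], config.attn["layers"])
    model = AutoModelForCausalLM.from_config(config)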
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/delta_net_1B.json
ADDED
@@ -0,0 +1,29 @@
{
  "attn": null,
  "attn_mode": "chunk",
  "bos_token_id": 1,
  "conv_size": 4,
  "eos_token_id": 2,
  "expand_k": 1,
  "expand_v": 1,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "model_type": "delta_net",
  "norm_eps": 1e-06,
  "num_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 2,
  "qk_activation": "silu",
  "qk_norm": "l2",
  "tie_word_embeddings": false,
  "use_beta": true,
  "use_cache": true,
  "use_gate": false,
  "use_output_norm": true,
  "use_short_conv": true
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/delta_net_340M.json
ADDED
@@ -0,0 +1,26 @@
{
  "attn_mode": "chunk",
  "bos_token_id": 1,
  "conv_size": 4,
  "eos_token_id": 2,
  "expand_k": 1,
  "expand_v": 1,
  "fuse_cross_entropy": true,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "model_type": "delta_net",
  "norm_eps": 1e-06,
  "num_heads": 8,
  "num_hidden_layers": 24,
  "qk_activation": "silu",
  "qk_norm": "l2",
  "tie_word_embeddings": false,
  "use_beta": true,
  "use_cache": true,
  "use_gate": false,
  "use_output_norm": true,
  "use_short_conv": true
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gated_deltanet_1B.json
ADDED
@@ -0,0 +1,22 @@
{
  "attn_mode": "chunk",
  "bos_token_id": 1,
  "conv_size": 4,
  "eos_token_id": 2,
  "expand_v": 2,
  "fuse_cross_entropy": true,
  "head_dim": 256,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "model_type": "gated_deltanet",
  "norm_eps": 1e-06,
  "num_heads": 6,
  "num_hidden_layers": 21,
  "tie_word_embeddings": false,
  "use_cache": true,
  "use_gate": true,
  "use_short_conv": true
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gated_deltanet_340M.json
ADDED
@@ -0,0 +1,22 @@
{
  "attn_mode": "chunk",
  "bos_token_id": 1,
  "conv_size": 4,
  "eos_token_id": 2,
  "expand_v": 2,
  "fuse_cross_entropy": true,
  "head_dim": 256,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "model_type": "gated_deltanet",
  "norm_eps": 1e-06,
  "num_heads": 6,
  "num_hidden_layers": 21,
  "tie_word_embeddings": false,
  "use_cache": true,
  "use_gate": true,
  "use_short_conv": true
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gdn_6_1_340M.json
ADDED
@@ -0,0 +1,50 @@
{
  "architectures": [
    "GatedDeltaNetForCausalLM"
  ],
  "attn": {
    "layers": [
      5,
      11,
      17,
      23
    ],
    "num_heads": 16,
    "num_kv_heads": 8,
    "qkv_bias": false,
    "rope_theta": 160000.0,
    "window_size": null
  },
  "attn_mode": "chunk",
  "bos_token_id": 1,
  "conv_size": 4,
  "eos_token_id": 2,
  "expand_k": 1,
  "expand_v": 1,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "fuse_swiglu": true,
  "head_dim": 256,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "max_position_embeddings": 8192,
  "model_type": "gated_deltanet",
  "norm_eps": 1e-06,
  "norm_first": false,
  "num_heads": 4,
  "num_hidden_layers": 24,
  "qk_activation": "silu",
  "qk_norm": "l2",
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "use_beta": true,
  "use_cache": true,
  "use_gate": true,
  "use_output_norm": true,
  "use_short_conv": true,
  "vocab_size": 32000
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gdn_6_1_340M_bf16.json
ADDED
@@ -0,0 +1,50 @@
{
  "architectures": [
    "GatedDeltaNetForCausalLM"
  ],
  "attn": {
    "layers": [
      5,
      11,
      17,
      23
    ],
    "num_heads": 16,
    "num_kv_heads": 8,
    "qkv_bias": false,
    "rope_theta": 160000.0,
    "window_size": null
  },
  "attn_mode": "chunk",
  "bos_token_id": 1,
  "conv_size": 4,
  "eos_token_id": 2,
  "expand_k": 1,
  "expand_v": 1,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "fuse_swiglu": true,
  "head_dim": 256,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "max_position_embeddings": 8192,
  "model_type": "gated_deltanet",
  "norm_eps": 1e-06,
  "norm_first": false,
  "num_heads": 4,
  "num_hidden_layers": 24,
  "qk_activation": "silu",
  "qk_norm": "l2",
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_beta": true,
  "use_cache": true,
  "use_gate": true,
  "use_output_norm": true,
  "use_short_conv": true,
  "vocab_size": 32000
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gdn_6_nsa_1_340M.json
ADDED
@@ -0,0 +1,53 @@
{
  "architectures": [
    "GatedDeltaNetForCausalLM"
  ],
  "attn": {
    "layers": [
      5,
      11,
      17,
      23
    ],
    "num_heads": 32,
    "num_kv_heads": 2,
    "qkv_bias": false,
    "rope_theta": 160000.0,
    "type": "nsa",
    "block_size": 64,
    "block_counts": 16,
    "window_size": 512
  },
  "attn_mode": "chunk",
  "bos_token_id": 1,
  "conv_size": 4,
  "eos_token_id": 2,
  "expand_k": 1,
  "expand_v": 1,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "fuse_swiglu": true,
  "head_dim": 256,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "max_position_embeddings": 8192,
  "model_type": "gated_deltanet",
  "norm_eps": 1e-06,
  "norm_first": false,
  "num_heads": 4,
  "num_hidden_layers": 24,
  "qk_activation": "silu",
  "qk_norm": "l2",
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "use_beta": true,
  "use_cache": true,
  "use_gate": true,
  "use_output_norm": true,
  "use_short_conv": true,
  "vocab_size": 32000
}
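
This is the config the run actually trained (it matches the exported config.json above, apart from a few keys added at export time). The attn block swaps the four listed layer indices from Gated DeltaNet to NSA attention, i.e. one attention layer per six of the 24. A trivial sketch of the implied layout:

    # Layout implied by gdn_6_nsa_1_340M.json: indices in attn["layers"] run
    # NSA attention; every other layer runs the Gated DeltaNet token mixer.
    attn_layers = {5, 11, 17, 23}
    layout = ["nsa" if i in attn_layers else "gdn" for i in range(24)]
    assert layout.count("nsa") == 4 and layout.count("gdn") == 20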
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gla_340M.json
ADDED
@@ -0,0 +1,24 @@
{
  "attn_mode": "chunk",
  "bos_token_id": 1,
  "clamp_min": null,
  "eos_token_id": 2,
  "expand_k": 0.5,
  "expand_v": 1,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "model_type": "gla",
  "num_heads": 4,
  "num_hidden_layers": 24,
  "norm_eps": 1e-06,
  "tie_word_embeddings": false,
  "use_cache": true,
  "use_gk": true,
  "use_gv": false,
  "vocab_size": 32000
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gla_7B.json
ADDED
@@ -0,0 +1,25 @@
{
  "attn": null,
  "attn_mode": "chunk",
  "bos_token_id": 1,
  "eos_token_id": 2,
  "expand_k": 0.5,
  "expand_v": 1,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "model_type": "gla",
  "norm_eps": 1e-06,
  "num_heads": 16,
  "num_hidden_layers": 32,
  "tie_word_embeddings": false,
  "use_cache": true,
  "use_gk": true,
  "use_gv": false,
  "use_output_gate": true,
  "use_short_conv": false
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/gsa_340M.json
ADDED
@@ -0,0 +1,29 @@
{
  "bos_token_id": 1,
  "conv_size": 4,
  "eos_token_id": 2,
  "expand_k": 1,
  "expand_v": 1,
  "elementwise_affine": false,
  "feature_map": "swish",
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "gate_logit_normalizer": 4,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "model_type": "gsa",
  "num_heads": 4,
  "num_hidden_layers": 24,
  "num_slots": 64,
  "norm_eps": 1e-06,
  "share_conv_kernel": true,
  "tie_word_embeddings": false,
  "use_cache": true,
  "use_norm": true,
  "use_output_gate": true,
  "use_rope": false,
  "use_short_conv": false
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/hgrn2_340M.json
ADDED
@@ -0,0 +1,20 @@
{
  "attn_mode": "chunk",
  "bos_token_id": 1,
  "eos_token_id": 2,
  "expand_ratio": 128,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "model_type": "hgrn2",
  "num_heads": 8,
  "num_hidden_layers": 24,
  "norm_eps": 1e-06,
  "tie_word_embeddings": false,
  "use_cache": true,
  "vocab_size": 32000
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_1B.json
ADDED
@@ -0,0 +1,32 @@
{
  "bos_token_id": 1,
  "chunk_size": 256,
  "conv_kernel": 4,
  "eos_token_id": 2,
  "expand": 2,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "norm_eps": 1e-05,
  "model_type": "mamba2",
  "n_groups": 1,
  "num_hidden_layers": 48,
  "pad_token_id": 0,
  "rescale_prenorm_residual": true,
  "residual_in_fp32": true,
  "rms_norm": true,
  "state_size": 128,
  "tie_word_embeddings": false,
  "time_step_floor": 0.0001,
  "time_step_max": 0.1,
  "time_step_min": 0.001,
  "time_step_rank": 128,
  "transformers_version": "4.50.1",
  "use_bias": false,
  "use_cache": true,
  "use_conv_bias": true,
  "vocab_size": 32000
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_340M.json
ADDED
@@ -0,0 +1,32 @@
{
  "bos_token_id": 1,
  "chunk_size": 256,
  "conv_kernel": 4,
  "eos_token_id": 2,
  "expand": 2,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "norm_eps": 1e-05,
  "model_type": "mamba2",
  "n_groups": 1,
  "num_hidden_layers": 48,
  "pad_token_id": 0,
  "rescale_prenorm_residual": true,
  "residual_in_fp32": true,
  "rms_norm": true,
  "state_size": 128,
  "tie_word_embeddings": false,
  "time_step_floor": 0.0001,
  "time_step_max": 0.1,
  "time_step_min": 0.001,
  "time_step_rank": 128,
  "transformers_version": "4.50.1",
  "use_bias": false,
  "use_cache": true,
  "use_conv_bias": true,
  "vocab_size": 32000
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba2_6_1_340M.json
ADDED
@@ -0,0 +1,50 @@
{
  "architectures": [
    "Mamba2ForCausalLM"
  ],
  "attn": {
    "layers": [
      5,
      11,
      17,
      23
    ],
    "num_heads": 16,
    "num_kv_heads": 8,
    "qkv_bias": false,
    "rope_theta": 160000.0,
    "window_size": null
  },
  "attn_mode": "chunk",
  "bos_token_id": 1,
  "chunk_size": 256,
  "conv_kernel": 4,
  "eos_token_id": 2,
  "expand": 2,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "fuse_swiglu": true,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "norm_eps": 1e-05,
  "model_type": "mamba2",
  "n_groups": 1,
  "num_hidden_layers": 48,
  "pad_token_id": 0,
  "rescale_prenorm_residual": true,
  "residual_in_fp32": true,
  "rms_norm": true,
  "state_size": 128,
  "tie_word_embeddings": false,
  "time_step_floor": 0.0001,
  "time_step_max": 0.1,
  "time_step_min": 0.001,
  "time_step_rank": 128,
  "transformers_version": "4.50.1",
  "use_bias": false,
  "use_cache": true,
  "use_conv_bias": true,
  "vocab_size": 32000
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba_1B.json
ADDED
@@ -0,0 +1,30 @@
{
  "bos_token_id": 1,
  "conv_kernel": 4,
  "eos_token_id": 2,
  "expand": 2,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "model_type": "mamba",
  "norm_eps": 1e-05,
  "num_hidden_layers": 48,
  "pad_token_id": 0,
  "rescale_prenorm_residual": false,
  "residual_in_fp32": false,
  "state_size": 16,
  "tie_word_embeddings": false,
  "time_step_floor": 0.0001,
  "time_step_init_scheme": "random",
  "time_step_max": 0.1,
  "time_step_min": 0.001,
  "time_step_rank": 128,
  "time_step_scale": 1.0,
  "transformers_version": "4.50.1",
  "use_bias": false,
  "use_cache": true,
  "use_conv_bias": true,
  "vocab_size": 32000
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/mamba_340M.json
ADDED
@@ -0,0 +1,30 @@
{
  "bos_token_id": 1,
  "conv_kernel": 4,
  "eos_token_id": 2,
  "expand": 2,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "model_type": "mamba",
  "norm_eps": 1e-05,
  "num_hidden_layers": 48,
  "pad_token_id": 0,
  "rescale_prenorm_residual": false,
  "residual_in_fp32": false,
  "state_size": 16,
  "tie_word_embeddings": false,
  "time_step_floor": 0.0001,
  "time_step_init_scheme": "random",
  "time_step_max": 0.1,
  "time_step_min": 0.001,
  "time_step_rank": 128,
  "time_step_scale": 1.0,
  "transformers_version": "4.50.1",
  "use_bias": false,
  "use_cache": true,
  "use_conv_bias": true,
  "vocab_size": 32000
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/samba_1B.json
ADDED
@@ -0,0 +1,52 @@
{
  "attn": {
    "layers": [
      1,
      3,
      5,
      7,
      9,
      11,
      13,
      15,
      17
    ],
    "num_heads": 18,
    "num_kv_heads": 18,
    "qkv_bias": false,
    "rope_theta": 10000.0,
    "window_size": 2048
  },
  "bos_token_id": 1,
  "conv_kernel": 4,
  "eos_token_id": 2,
  "expand": 2,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "fuse_swiglu": true,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 2304,
  "initializer_range": 0.02,
  "intermediate_size": 4608,
  "max_position_embeddings": 2048,
  "model_type": "samba",
  "norm_eps": 1e-05,
  "num_hidden_layers": 18,
  "pad_token_id": 0,
  "rescale_prenorm_residual": false,
  "residual_in_fp32": false,
  "state_size": 16,
  "tie_word_embeddings": false,
  "time_step_floor": 0.0001,
  "time_step_init_scheme": "random",
  "time_step_max": 0.1,
  "time_step_min": 0.001,
  "time_step_rank": 144,
  "time_step_scale": 1.0,
  "transformers_version": "4.50.1",
  "use_bias": false,
  "use_cache": true,
  "use_conv_bias": true,
  "vocab_size": 32000
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/sba_340m.json
ADDED
@@ -0,0 +1,18 @@
{
  "attention_bias": false,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "hidden_act": "swish",
  "hidden_size": 1024,
  "initializer_range": 0.006,
  "max_position_embeddings": 8192,
  "model_type": "sba",
  "num_heads": 16,
  "num_hidden_layers": 24,
  "norm_eps": 1e-06,
  "tie_word_embeddings": false,
  "use_cache": true,
  "vocab_size": 32000
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_1B.json
ADDED
@@ -0,0 +1,22 @@
{
  "bos_token_id": 1,
  "elementwise_affine": true,
  "eos_token_id": 2,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "fuse_swiglu": true,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "max_position_embeddings": 8192,
  "model_type": "transformer",
  "norm_eps": 1e-06,
  "num_heads": 32,
  "num_hidden_layers": 24,
  "num_kv_heads": null,
  "pad_token_id": 2,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_340M.json
ADDED
@@ -0,0 +1,18 @@
{
  "attention_bias": false,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "hidden_act": "swish",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "max_position_embeddings": 8192,
  "model_type": "transformer",
  "num_heads": 16,
  "num_hidden_layers": 24,
  "norm_eps": 1e-06,
  "tie_word_embeddings": false,
  "use_cache": true,
  "vocab_size": 32000
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/configs/transformer_7B.json
ADDED
@@ -0,0 +1,21 @@
{
  "attention_bias": false,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "model_type": "transformer",
  "norm_eps": 1e-06,
  "num_heads": 32,
  "num_hidden_layers": 32,
  "num_kv_heads": 8,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "use_cache": true,
  "window_size": null
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/generation_config.json
ADDED
@@ -0,0 +1,6 @@
{
  "_from_model_config": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "transformers_version": "4.53.3"
}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/0/error.json
ADDED
@@ -0,0 +1 @@
{"message": {"message": "InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'\n\nfrom user code:\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857\n if window_size > 0:\n\nSet TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS=\"+dynamo\"\n", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 488, in main\n output = model(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py\", line 172, in wrapped_func\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 424, in forward\n outputs = self.model(\n ^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 294, in forward\n hidden_states, attentions, past_key_values = layer(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 655, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 108, in forward\n hidden_states = self.attn_norm(hidden_states)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 109, in torch_dynamo_resume_in_forward_at_108\n hidden_states, attentions, past_key_values = self.attn(\n ^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py\", line 108, in forward\n q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py\", line 123, in torch_dynamo_resume_in_forward_at_108\n o = parallel_nsa(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 838, in parallel_nsa\n o_cmp, lse_cmp = parallel_nsa_compression(\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 857, in torch_dynamo_resume_in_parallel_nsa_at_838\n o = o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, scale, cu_seqlens)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1432, in __call__\n return self._torchdynamo_orig_callable(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1213, in __call__\n result = self._inner_convert(\n ^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 598, in __call__\n return _compile(\n ^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1110, in _compile\n raise InternalTorchDynamoError(\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1059, in _compile\n guarded_code = compile_inner(code, one_graph, hooks, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_utils_internal.py\", line 97, in wrapper_function\n return function(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 761, in compile_inner\n return _compile_inner(code, one_graph, hooks, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 797, in _compile_inner\n out_code = transform_code_object(code, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/bytecode_transformation.py\", line 1422, in transform_code_object\n transformations(instructions, code_options)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 257, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 715, in transform\n tracer.run()\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 3498, in run\n super().run()\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 1337, in run\n while self.step():\n ^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 1246, in step\n self.dispatch_table[inst.opcode](self, inst)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 2157, in COMPARE_OP\n self.push(compare_op_handlers[inst.argval](self, self.popn(2), {}))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 1111, in call_function\n return handler(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 789, in <lambda>\n return lambda tx, args, kwargs: obj.call_function(\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 1111, in call_function\n return handler(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 945, in builtin_dispatch\n rv = fn(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 839, in call_binop_handlers\n rv = fn(tx, *args)\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 533, in compare_by_value\n return ConstantVariable(op(a.value, b.value))\n ^^^^^^^^^^^^^^^^^^^^\ntorch._dynamo.exc.InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'\n\nfrom user code:\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857\n if window_size > 0:\n\nSet TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS=\"+dynamo\"\n\n", "timestamp": "1753352474"}}}
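
The failing comparison sits in traced user code, fla/ops/nsa/parallel.py line 862: `if window_size > 0:` raises a TypeError under TorchDynamo when `window_size` is None. A None-safe guard of the kind that would avoid this (a hypothetical patch sketch, not the library's actual fix):

    # Hypothetical None-safe rewrite of the guard at fla/ops/nsa/parallel.py:862
    # that produced the InternalTorchDynamoError captured in error.json.
    def has_sliding_window(window_size):
        # The original `window_size > 0` raises TypeError when window_size is None.
        return window_size is not None and window_size > 0

    assert has_sliding_window(512) is True
    assert has_sliding_window(None) is False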
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/0/stderr.log
ADDED
|
@@ -0,0 +1,573 @@
[titan] 2025-07-24 18:19:03,237 - root - INFO - Starting job: default job
[titan] 2025-07-24 18:19:03,237 - root - INFO - {
  "activation_checkpoint": {
    "mode": "none",
    "selective_ac_option": "2"
  },
  "activation_offload": {
    "mode": "none"
  },
  "checkpoint": {
    "async_mode": "disabled",
    "create_seed_checkpoint": false,
    "enable_checkpoint": true,
    "exclude_from_loading": [],
    "export_dtype": "bfloat16",
    "folder": "checkpoint",
    "interval": 8192,
    "interval_type": "steps",
    "keep_latest_k": 100,
    "load_step": -1,
    "model_weights_only": false
  },
  "comm": {
    "init_timeout_seconds": 300,
    "trace_buf_size": 20000,
    "train_timeout_seconds": 100
  },
  "experimental": {
    "context_parallel_degree": 1,
    "context_parallel_rotate_method": "allgather",
    "custom_model_path": "",
    "enable_async_tensor_parallel": false,
    "enable_compiled_autograd": false,
    "pipeline_parallel_degree": 1,
    "pipeline_parallel_microbatches": null,
    "pipeline_parallel_schedule": "1F1B",
    "pipeline_parallel_schedule_csv": "",
    "pipeline_parallel_split_points": []
  },
  "fault_tolerance": {
    "enable": false,
    "group_size": 0,
    "min_replica_size": 1,
    "replica_id": 0
  },
  "float8": {
    "enable_fsdp_float8_all_gather": false,
    "force_recompute_fp8_weight_in_bwd": false,
    "precompute_float8_dynamic_scale_for_fsdp": false,
    "recipe_name": null
  },
  "job": {
    "config_file": "flame/models/fla.toml",
    "description": "default job",
    "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2",
    "print_args": true,
    "use_for_integration_test": false
  },
  "lr_scheduler": {
    "decay_ratio": 1.0,
    "decay_type": "linear",
    "lr_min": 0.01,
    "warmup_steps": 100
  },
  "memory_estimation": {
    "disable_fake_mode": false,
    "enabled": false
  },
  "metrics": {
    "disable_color_printing": false,
    "enable_tensorboard": true,
    "enable_wandb": true,
    "log_freq": 1,
    "save_for_all_ranks": false,
    "save_tb_folder": "tb"
  },
  "model": {
    "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_nsa_1_340M.json",
    "converters": [],
    "name": "fla",
    "print_after_conversion": false,
    "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
  },
  "optimizer": {
    "early_step_in_backward": false,
    "eps": 1e-08,
    "implementation": "fused",
    "lr": 0.0003,
    "name": "AdamW"
  },
  "profiling": {
    "enable_memory_snapshot": false,
    "enable_profiling": true,
    "profile_freq": 512,
    "save_memory_snapshot_folder": "memory_snapshot",
    "save_traces_folder": "profile_trace"
  },
  "training": {
    "batch_size": 8,
    "compile": true,
    "context_len": 8192,
    "data_dir": null,
    "data_files": null,
    "data_parallel_replicate_degree": 1,
    "data_parallel_shard_degree": -1,
    "data_probs": "0.55,0.3,0.15",
    "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro",
    "dataset_name": "default,default,default",
    "dataset_split": "train,train,train",
    "deterministic": false,
    "disable_loss_parallel": false,
    "enable_cpu_offload": false,
    "fsdp_reshard_after_forward": "default",
    "gc_freq": 50,
    "gradient_accumulation_steps": 2,
    "max_norm": 1.0,
    "mixed_precision_param": "bfloat16",
    "mixed_precision_reduce": "float32",
    "num_workers": 32,
    "persistent_workers": false,
    "pin_memory": false,
    "prefetch_factor": 2,
    "seed": 42,
    "seq_len": 8192,
    "skip_nan_inf": true,
    "steps": 95366,
    "streaming": true,
    "tensor_parallel_degree": 1,
    "varlen": false
  }
}
[titan] 2025-07-24 18:19:03,238 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
[titan] 2025-07-24 18:19:03,238 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
[titan] 2025-07-24 18:19:03,244 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
[titan] 2025-07-24 18:19:03,587 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
[titan] 2025-07-24 18:19:03,587 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
[titan] 2025-07-24 18:19:03,588 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
[titan] 2025-07-24 18:19:04,524 - root - INFO - Loading tokenizer...
[titan] 2025-07-24 18:19:04,641 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
        0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
        1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
        2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)
[titan] 2025-07-24 18:19:04,642 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default,default,default
[titan] 2025-07-24 18:19:04,833 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
IterableDataset({
    features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
    num_shards: 140
})
[titan] 2025-07-24 18:19:04,833 - root - INFO - Shuffling the dataset with seed 42
[titan] 2025-07-24 18:19:04,833 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
[titan] 2025-07-24 18:19:55,836 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged:default (p = 0.300):
IterableDataset({
    features: ['repo', 'content'],
    num_shards: 1
})
[titan] 2025-07-24 18:19:55,836 - root - INFO - Shuffling the dataset with seed 42
[titan] 2025-07-24 18:19:55,836 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged has insufficient shards (1). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
[titan] 2025-07-24 18:19:56,175 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default (p = 0.150):
IterableDataset({
    features: ['text', 'cc-path', 'domain', 'lang', 'lang_score', 'timestamp', 'url', 'math_score'],
    num_shards: 100
})
[titan] 2025-07-24 18:19:56,175 - root - INFO - Shuffling the dataset with seed 42
[titan] 2025-07-24 18:19:56,175 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro has insufficient shards (100). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
[titan] 2025-07-24 18:20:02,301 - root - INFO - Interleaving 3 datasets with probabilities [0.55, 0.3, 0.15]
[titan] 2025-07-24 18:20:02,989 - root - INFO - IterableDataset({
    features: ['text', 'content'],
    num_shards: 256
})
[titan] 2025-07-24 18:20:03,108 - root - INFO - Building dataloader...
[titan] 2025-07-24 18:20:03,110 - root - INFO - Loading model config from /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_nsa_1_340M.json
[titan] 2025-07-24 18:20:03,112 - root - INFO - Building model from the config
GatedDeltaNetConfig {
  "allow_neg_eigval": false,
  "architectures": [
    "GatedDeltaNetForCausalLM"
  ],
  "attn": {
    "block_counts": 16,
    "block_size": 64,
    "layers": [
      5,
      11,
      17,
      23
    ],
    "num_heads": 32,
    "num_kv_heads": 2,
    "qkv_bias": false,
    "rope_theta": 160000.0,
    "type": "nsa",
    "window_size": null
  },
  "attn_mode": "chunk",
  "bos_token_id": 1,
  "conv_size": 4,
  "eos_token_id": 2,
  "expand_k": 1,
  "expand_v": 1,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "fuse_swiglu": true,
  "head_dim": 256,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "max_position_embeddings": 8192,
  "model_type": "gated_deltanet",
  "norm_eps": 1e-06,
  "norm_first": false,
  "num_heads": 4,
  "num_hidden_layers": 24,
  "num_v_heads": null,
  "qk_activation": "silu",
  "qk_norm": "l2",
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.53.3",
  "use_beta": true,
  "use_cache": true,
  "use_gate": true,
  "use_l2warp": false,
  "use_output_norm": true,
  "use_short_conv": true,
  "vocab_size": 32000
}

[titan] 2025-07-24 18:20:03,455 - root - INFO -
GatedDeltaNetForCausalLM(
  (model): GatedDeltaNetModel(
    (embeddings): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-4): 5 x GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): GatedDeltaNet(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (5): GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): NativeSparseAttention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (6-10): 5 x GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): GatedDeltaNet(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (11): GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): NativeSparseAttention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (12-16): 5 x GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): GatedDeltaNet(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (17): GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): NativeSparseAttention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (18-22): 5 x GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): GatedDeltaNet(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (23): GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): NativeSparseAttention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
    )
    (norm): RMSNorm(1024, eps=1e-06)
  )
  (lm_head): Linear(in_features=1024, out_features=32000, bias=False)
  (criterion): FusedLinearCrossEntropyLoss()
)

[titan] 2025-07-24 18:20:03,490 - root - INFO - Compiling each block with torch.compile
[titan] 2025-07-24 18:20:03,491 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
[titan] 2025-07-24 18:20:03,491 - root - INFO - Compiling the entire model with torch.compile
[titan] 2025-07-24 18:20:03,573 - root - INFO - Applied FSDP to the model
[titan] 2025-07-24 18:20:04,119 - fla.models.gated_deltanet.modeling_gated_deltanet - WARNING - `A_log` is a DTensor, skipping initialization
[titan] 2025-07-24 18:20:04,120 - fla.models.gated_deltanet.modeling_gated_deltanet - WARNING - `dt_bias` is a DTensor, skipping initialization
[titan] 2025-07-24 18:20:04,215 - root - INFO - CUDA memory usage for model: 0.10GiB(0.10%)
[titan] 2025-07-24 18:20:04,217 - root - WARNING - Warmup (100) + decay (95366) steps exceed total training steps (95366). Adjusting decay steps to 95266.
[titan] 2025-07-24 18:20:04,240 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/checkpoint
[titan] 2025-07-24 18:20:05,365 - root - ERROR - Failed to create WandB logger: api_key not configured (no-tty). call wandb.login(key=[your_api_key])
[titan] 2025-07-24 18:20:05,376 - root - INFO - TensorBoard logging enabled. Logs will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/tb/20250724-1820
[titan] 2025-07-24 18:20:05,376 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
[titan] 2025-07-24 18:20:05,441 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
[titan] 2025-07-24 18:20:11,953 - root - INFO - ***** Running training *****
[titan] 2025-07-24 18:20:11,954 - root - INFO - Training starts at step 1
[titan] 2025-07-24 18:20:11,956 - root - INFO - Number of tokens per sequence = 8,192
[titan] 2025-07-24 18:20:11,956 - root - INFO - Gradient Accumulation steps = 2
[titan] 2025-07-24 18:20:11,957 - root - INFO - Instantaneous batch size (per device) = 8
[titan] 2025-07-24 18:20:11,957 - root - INFO - Global batch size (w. parallel, distributed & accumulation) = 128 (1,048,576 tokens)
[titan] 2025-07-24 18:20:11,957 - root - INFO - Total optimization steps = 95,366 (99,998,498,816 tokens)
[titan] 2025-07-24 18:20:11,957 - root - INFO - Warmup steps = 100 (104,857,600 tokens)
[titan] 2025-07-24 18:20:11,957 - root - INFO - Number of parameters = 396,695,712
[titan] 2025-07-24 18:20:11,957 - root - INFO - Profiling active. Traces will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/profile_trace
/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py:1263: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
  torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
[rank0]: Traceback (most recent call last):
[rank0]:   File "<frozen runpy>", line 198, in _run_module_as_main
[rank0]:   File "<frozen runpy>", line 88, in _run_code
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 616, in <module>
[rank0]:     main(config)
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
[rank0]:     return f(*args, **kwargs)
[rank0]:     ^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 488, in main
[rank0]:     output = model(
[rank0]:     ^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank0]:     return self._call_impl(*args, **kwargs)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
[rank0]:     return inner()
[rank0]:     ^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
[rank0]:     result = forward_call(*args, **kwargs)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
[rank0]:     return func(*args, **kwargs)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 424, in forward
[rank0]:     outputs = self.model(
[rank0]:     ^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank0]:     return self._call_impl(*args, **kwargs)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
[rank0]:     return forward_call(*args, **kwargs)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 294, in forward
[rank0]:     hidden_states, attentions, past_key_values = layer(
[rank0]:     ^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank0]:     return self._call_impl(*args, **kwargs)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
[rank0]:     return inner()
[rank0]:     ^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
[rank0]:     result = forward_call(*args, **kwargs)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
[rank0]:     return fn(*args, **kwargs)
[rank0]:     ^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank0]:     return self._call_impl(*args, **kwargs)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
[rank0]:     return forward_call(*args, **kwargs)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 108, in forward
[rank0]:     hidden_states = self.attn_norm(hidden_states)
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 109, in torch_dynamo_resume_in_forward_at_108
[rank0]:     hidden_states, attentions, past_key_values = self.attn(
[rank0]:     ^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank0]:     return self._call_impl(*args, **kwargs)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
[rank0]:     return forward_call(*args, **kwargs)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py", line 108, in forward
[rank0]:     q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py", line 123, in torch_dynamo_resume_in_forward_at_108
[rank0]:     o = parallel_nsa(
[rank0]:     ^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 838, in parallel_nsa
[rank0]:     o_cmp, lse_cmp = parallel_nsa_compression(
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 857, in torch_dynamo_resume_in_parallel_nsa_at_838
[rank0]:     o = o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, scale, cu_seqlens)
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1432, in __call__
[rank0]:     return self._torchdynamo_orig_callable(
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1213, in __call__
[rank0]:     result = self._inner_convert(
[rank0]:     ^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 598, in __call__
[rank0]:     return _compile(
[rank0]:     ^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1110, in _compile
[rank0]:     raise InternalTorchDynamoError(
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1059, in _compile
[rank0]:     guarded_code = compile_inner(code, one_graph, hooks, transform)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_utils_internal.py", line 97, in wrapper_function
[rank0]:     return function(*args, **kwargs)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 761, in compile_inner
[rank0]:     return _compile_inner(code, one_graph, hooks, transform)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 797, in _compile_inner
[rank0]:     out_code = transform_code_object(code, transform)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/bytecode_transformation.py", line 1422, in transform_code_object
[rank0]:     transformations(instructions, code_options)
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 257, in _fn
[rank0]:     return fn(*args, **kwargs)
[rank0]:     ^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 715, in transform
[rank0]:     tracer.run()
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 3498, in run
[rank0]:     super().run()
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 1337, in run
[rank0]:     while self.step():
[rank0]:     ^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 1246, in step
[rank0]:     self.dispatch_table[inst.opcode](self, inst)
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 2157, in COMPARE_OP
[rank0]:     self.push(compare_op_handlers[inst.argval](self, self.popn(2), {}))
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 1111, in call_function
[rank0]:     return handler(tx, args, kwargs)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 789, in <lambda>
[rank0]:     return lambda tx, args, kwargs: obj.call_function(
[rank0]:     ^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 1111, in call_function
[rank0]:     return handler(tx, args, kwargs)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 945, in builtin_dispatch
[rank0]:     rv = fn(tx, args, kwargs)
[rank0]:     ^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 839, in call_binop_handlers
[rank0]:     rv = fn(tx, *args)
[rank0]:     ^^^^^^^^^^^^^
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 533, in compare_by_value
[rank0]:     return ConstantVariable(op(a.value, b.value))
[rank0]:     ^^^^^^^^^^^^^^^^^^^^
[rank0]: torch._dynamo.exc.InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'

[rank0]: from user code:
[rank0]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857
[rank0]:     if window_size > 0:

[rank0]: Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"

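A note on the failure recorded above: the run sets `"window_size": null` in the `attn` block of gdn_6_nsa_1_340M.json, and the crash comes from `if window_size > 0:` at fla/ops/nsa/parallel.py:862, where Python (surfaced by Dynamo as an InternalTorchDynamoError) refuses to order `None` against an `int`. Below is a minimal sketch of the kind of None-guard that avoids the comparison; the helper name is hypothetical and only `window_size` is taken from the log, so this is an illustration, not the library's actual fix.

```python
from typing import Optional

def sliding_window_enabled(window_size: Optional[int]) -> bool:
    # Check for None before comparing: `None > 0` raises
    # "TypeError: '>' not supported between instances of 'NoneType' and 'int'",
    # the exact error in the traceback above.
    return window_size is not None and window_size > 0

# With this config ("window_size": null -> None), the sliding-window
# branch is skipped instead of crashing compilation.
assert sliding_window_enabled(None) is False
assert sliding_window_enabled(512) is True
```

Alternatively, giving `window_size` an integer value in the config would sidestep the branch entirely; either way the `NoneType`-vs-`int` comparison never reaches Dynamo.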
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/0/stdout.log
ADDED
File without changes
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/1/error.json
ADDED
@@ -0,0 +1 @@
{"message": {"message": "InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'\n\nfrom user code:\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857\n if window_size > 0:\n\nSet TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS=\"+dynamo\"\n", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 488, in main\n output = model(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py\", line 172, in wrapped_func\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 424, in forward\n outputs = self.model(\n ^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 294, in forward\n hidden_states, attentions, past_key_values = layer(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 655, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 108, in forward\n hidden_states = self.attn_norm(hidden_states)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 109, in torch_dynamo_resume_in_forward_at_108\n hidden_states, attentions, past_key_values = self.attn(\n ^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py\", line 108, in forward\n q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py\", line 123, in torch_dynamo_resume_in_forward_at_108\n o = parallel_nsa(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 838, in parallel_nsa\n o_cmp, lse_cmp = parallel_nsa_compression(\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 857, in torch_dynamo_resume_in_parallel_nsa_at_838\n o = o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, 
block_size, scale, cu_seqlens)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1432, in __call__\n return self._torchdynamo_orig_callable(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1213, in __call__\n result = self._inner_convert(\n ^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 598, in __call__\n return _compile(\n ^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1110, in _compile\n raise InternalTorchDynamoError(\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1059, in _compile\n guarded_code = compile_inner(code, one_graph, hooks, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_utils_internal.py\", line 97, in wrapper_function\n return function(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 761, in compile_inner\n return _compile_inner(code, one_graph, hooks, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 797, in _compile_inner\n out_code = transform_code_object(code, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/bytecode_transformation.py\", line 1422, in transform_code_object\n transformations(instructions, code_options)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 257, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 715, in transform\n tracer.run()\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 3498, in run\n super().run()\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 1337, in run\n while self.step():\n ^^^^^^^^^^^\n File 
\"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 1246, in step\n self.dispatch_table[inst.opcode](self, inst)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 2157, in COMPARE_OP\n self.push(compare_op_handlers[inst.argval](self, self.popn(2), {}))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 1111, in call_function\n return handler(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 789, in <lambda>\n return lambda tx, args, kwargs: obj.call_function(\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 1111, in call_function\n return handler(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 945, in builtin_dispatch\n rv = fn(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 839, in call_binop_handlers\n rv = fn(tx, *args)\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 533, in compare_by_value\n return ConstantVariable(op(a.value, b.value))\n ^^^^^^^^^^^^^^^^^^^^\ntorch._dynamo.exc.InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'\n\nfrom user code:\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857\n if window_size > 0:\n\nSet TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS=\"+dynamo\"\n\n", "timestamp": "1753352474"}}}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/1/stderr.log
ADDED
@@ -0,0 +1,571 @@
[titan] 2025-07-24 18:19:06,932 - root - INFO - Starting job: default job
[titan] 2025-07-24 18:19:06,933 - root - INFO - {
  "activation_checkpoint": {
    "mode": "none",
    "selective_ac_option": "2"
  },
  "activation_offload": {
    "mode": "none"
  },
  "checkpoint": {
    "async_mode": "disabled",
    "create_seed_checkpoint": false,
    "enable_checkpoint": true,
    "exclude_from_loading": [],
    "export_dtype": "bfloat16",
    "folder": "checkpoint",
    "interval": 8192,
    "interval_type": "steps",
    "keep_latest_k": 100,
    "load_step": -1,
    "model_weights_only": false
  },
  "comm": {
    "init_timeout_seconds": 300,
    "trace_buf_size": 20000,
    "train_timeout_seconds": 100
  },
  "experimental": {
    "context_parallel_degree": 1,
    "context_parallel_rotate_method": "allgather",
    "custom_model_path": "",
    "enable_async_tensor_parallel": false,
    "enable_compiled_autograd": false,
    "pipeline_parallel_degree": 1,
    "pipeline_parallel_microbatches": null,
    "pipeline_parallel_schedule": "1F1B",
    "pipeline_parallel_schedule_csv": "",
    "pipeline_parallel_split_points": []
  },
  "fault_tolerance": {
    "enable": false,
    "group_size": 0,
    "min_replica_size": 1,
    "replica_id": 0
  },
  "float8": {
    "enable_fsdp_float8_all_gather": false,
    "force_recompute_fp8_weight_in_bwd": false,
    "precompute_float8_dynamic_scale_for_fsdp": false,
    "recipe_name": null
  },
  "job": {
    "config_file": "flame/models/fla.toml",
    "description": "default job",
    "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2",
    "print_args": true,
    "use_for_integration_test": false
  },
  "lr_scheduler": {
    "decay_ratio": 1.0,
    "decay_type": "linear",
    "lr_min": 0.01,
    "warmup_steps": 100
  },
  "memory_estimation": {
    "disable_fake_mode": false,
    "enabled": false
  },
  "metrics": {
    "disable_color_printing": false,
    "enable_tensorboard": true,
    "enable_wandb": true,
    "log_freq": 1,
    "save_for_all_ranks": false,
    "save_tb_folder": "tb"
  },
  "model": {
    "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_nsa_1_340M.json",
    "converters": [],
    "name": "fla",
    "print_after_conversion": false,
    "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
  },
  "optimizer": {
    "early_step_in_backward": false,
    "eps": 1e-08,
    "implementation": "fused",
    "lr": 0.0003,
    "name": "AdamW"
  },
  "profiling": {
    "enable_memory_snapshot": false,
    "enable_profiling": true,
    "profile_freq": 512,
    "save_memory_snapshot_folder": "memory_snapshot",
    "save_traces_folder": "profile_trace"
  },
  "training": {
    "batch_size": 8,
    "compile": true,
    "context_len": 8192,
    "data_dir": null,
    "data_files": null,
    "data_parallel_replicate_degree": 1,
    "data_parallel_shard_degree": -1,
    "data_probs": "0.55,0.3,0.15",
    "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro",
    "dataset_name": "default,default,default",
    "dataset_split": "train,train,train",
    "deterministic": false,
    "disable_loss_parallel": false,
    "enable_cpu_offload": false,
    "fsdp_reshard_after_forward": "default",
    "gc_freq": 50,
    "gradient_accumulation_steps": 2,
    "max_norm": 1.0,
    "mixed_precision_param": "bfloat16",
    "mixed_precision_reduce": "float32",
    "num_workers": 32,
    "persistent_workers": false,
    "pin_memory": false,
    "prefetch_factor": 2,
    "seed": 42,
    "seq_len": 8192,
    "skip_nan_inf": true,
    "steps": 95366,
    "streaming": true,
    "tensor_parallel_degree": 1,
    "varlen": false
  }
}
[titan] 2025-07-24 18:19:06,933 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
[titan] 2025-07-24 18:19:07,405 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
[titan] 2025-07-24 18:19:07,407 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
[titan] 2025-07-24 18:19:07,464 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
[titan] 2025-07-24 18:19:07,465 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
[titan] 2025-07-24 18:19:07,465 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
[titan] 2025-07-24 18:19:07,519 - root - INFO - Loading tokenizer...
[titan] 2025-07-24 18:19:07,629 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
  0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
  1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
  2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)
[titan] 2025-07-24 18:19:07,629 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default,default,default
[titan] 2025-07-24 18:19:07,764 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
IterableDataset({
    features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
    num_shards: 140
})
[titan] 2025-07-24 18:19:07,764 - root - INFO - Shuffling the dataset with seed 42
[titan] 2025-07-24 18:19:07,764 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
[titan] 2025-07-24 18:20:12,516 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged:default (p = 0.300):
IterableDataset({
    features: ['repo', 'content'],
    num_shards: 1
})
[titan] 2025-07-24 18:20:12,516 - root - INFO - Shuffling the dataset with seed 42
[titan] 2025-07-24 18:20:12,516 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged has insufficient shards (1). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
[titan] 2025-07-24 18:20:12,876 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default (p = 0.150):
IterableDataset({
    features: ['text', 'cc-path', 'domain', 'lang', 'lang_score', 'timestamp', 'url', 'math_score'],
    num_shards: 100
})
[titan] 2025-07-24 18:20:12,877 - root - INFO - Shuffling the dataset with seed 42
[titan] 2025-07-24 18:20:12,877 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro has insufficient shards (100). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
[titan] 2025-07-24 18:20:19,318 - root - INFO - Interleaving 3 datasets with probabilities [0.55, 0.3, 0.15]
[titan] 2025-07-24 18:20:20,040 - root - INFO - IterableDataset({
    features: ['text', 'content'],
    num_shards: 256
})
[titan] 2025-07-24 18:20:20,162 - root - INFO - Building dataloader...
[titan] 2025-07-24 18:20:20,164 - root - INFO - Loading model config from /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_nsa_1_340M.json
[titan] 2025-07-24 18:20:20,166 - root - INFO - Building model from the config
GatedDeltaNetConfig {
  "allow_neg_eigval": false,
  "architectures": [
    "GatedDeltaNetForCausalLM"
  ],
  "attn": {
    "block_counts": 16,
    "block_size": 64,
    "layers": [
      5,
      11,
      17,
      23
    ],
    "num_heads": 32,
    "num_kv_heads": 2,
    "qkv_bias": false,
    "rope_theta": 160000.0,
    "type": "nsa",
    "window_size": null
  },
  "attn_mode": "chunk",
  "bos_token_id": 1,
  "conv_size": 4,
  "eos_token_id": 2,
  "expand_k": 1,
  "expand_v": 1,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "fuse_swiglu": true,
  "head_dim": 256,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "max_position_embeddings": 8192,
  "model_type": "gated_deltanet",
  "norm_eps": 1e-06,
  "norm_first": false,
  "num_heads": 4,
  "num_hidden_layers": 24,
  "num_v_heads": null,
  "qk_activation": "silu",
  "qk_norm": "l2",
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.53.3",
  "use_beta": true,
  "use_cache": true,
  "use_gate": true,
  "use_l2warp": false,
  "use_output_norm": true,
  "use_short_conv": true,
  "vocab_size": 32000
}
[titan] 2025-07-24 18:20:20,496 - root - INFO -
GatedDeltaNetForCausalLM(
  (model): GatedDeltaNetModel(
    (embeddings): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-4): 5 x GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): GatedDeltaNet(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (5): GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): NativeSparseAttention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (6-10): 5 x GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): GatedDeltaNet(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (11): GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): NativeSparseAttention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (12-16): 5 x GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): GatedDeltaNet(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (17): GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): NativeSparseAttention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (18-22): 5 x GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): GatedDeltaNet(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (23): GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): NativeSparseAttention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
    )
    (norm): RMSNorm(1024, eps=1e-06)
  )
  (lm_head): Linear(in_features=1024, out_features=32000, bias=False)
  (criterion): FusedLinearCrossEntropyLoss()
)

[titan] 2025-07-24 18:20:20,532 - root - INFO - Compiling each block with torch.compile
[titan] 2025-07-24 18:20:20,532 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
[titan] 2025-07-24 18:20:20,533 - root - INFO - Compiling the entire model with torch.compile
[titan] 2025-07-24 18:20:20,617 - root - INFO - Applied FSDP to the model
[titan] 2025-07-24 18:20:20,680 - fla.models.gated_deltanet.modeling_gated_deltanet - WARNING - `A_log` is a DTensor, skipping initialization
[titan] 2025-07-24 18:20:20,680 - fla.models.gated_deltanet.modeling_gated_deltanet - WARNING - `dt_bias` is a DTensor, skipping initialization
[titan] 2025-07-24 18:20:20,776 - root - INFO - CUDA memory usage for model: 0.10GiB(0.10%)
[titan] 2025-07-24 18:20:20,778 - root - WARNING - Warmup (100) + decay (95366) steps exceed total training steps (95366). Adjusting decay steps to 95266.
[titan] 2025-07-24 18:20:20,802 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/checkpoint
[titan] 2025-07-24 18:20:20,803 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
[titan] 2025-07-24 18:20:20,860 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
[titan] 2025-07-24 18:20:29,317 - root - INFO - ***** Running training *****
[titan] 2025-07-24 18:20:29,347 - root - INFO -   Training starts at step 1
[titan] 2025-07-24 18:20:29,349 - root - INFO -   Number of tokens per sequence = 8,192
[titan] 2025-07-24 18:20:29,350 - root - INFO -   Gradient Accumulation steps = 2
[titan] 2025-07-24 18:20:29,350 - root - INFO -   Instantaneous batch size (per device) = 8
[titan] 2025-07-24 18:20:29,350 - root - INFO -   Global batch size (w. parallel, distributed & accumulation) = 128 (1,048,576 tokens)
[titan] 2025-07-24 18:20:29,350 - root - INFO -   Total optimization steps = 95,366 (99,998,498,816 tokens)
[titan] 2025-07-24 18:20:29,350 - root - INFO -   Warmup steps = 100 (104,857,600 tokens)
[titan] 2025-07-24 18:20:29,350 - root - INFO -   Number of parameters = 396,695,712
[titan] 2025-07-24 18:20:29,351 - root - INFO - Profiling active. Traces will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/profile_trace
/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py:1263: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
  torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
[rank1]: Traceback (most recent call last):
[rank1]:   File "<frozen runpy>", line 198, in _run_module_as_main
[rank1]:   File "<frozen runpy>", line 88, in _run_code
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 616, in <module>
[rank1]:     main(config)
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
[rank1]:     return f(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 488, in main
[rank1]:     output = model(
[rank1]:              ^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank1]:     return self._call_impl(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
[rank1]:     return inner()
[rank1]:            ^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
[rank1]:     result = forward_call(*args, **kwargs)
[rank1]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
[rank1]:     return func(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 424, in forward
[rank1]:     outputs = self.model(
[rank1]:               ^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank1]:     return self._call_impl(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
[rank1]:     return forward_call(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 294, in forward
[rank1]:     hidden_states, attentions, past_key_values = layer(
[rank1]:                                                  ^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank1]:     return self._call_impl(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
[rank1]:     return inner()
[rank1]:            ^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
[rank1]:     result = forward_call(*args, **kwargs)
[rank1]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
[rank1]:     return fn(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank1]:     return self._call_impl(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
[rank1]:     return forward_call(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 108, in forward
[rank1]:     hidden_states = self.attn_norm(hidden_states)
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 109, in torch_dynamo_resume_in_forward_at_108
[rank1]:     hidden_states, attentions, past_key_values = self.attn(
[rank1]:                                                  ^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank1]:     return self._call_impl(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
[rank1]:     return forward_call(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py", line 108, in forward
[rank1]:     q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py", line 123, in torch_dynamo_resume_in_forward_at_108
[rank1]:     o = parallel_nsa(
[rank1]:         ^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 838, in parallel_nsa
[rank1]:     o_cmp, lse_cmp = parallel_nsa_compression(
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 857, in torch_dynamo_resume_in_parallel_nsa_at_838
[rank1]:     o = o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, scale, cu_seqlens)
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1432, in __call__
[rank1]:     return self._torchdynamo_orig_callable(
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1213, in __call__
[rank1]:     result = self._inner_convert(
[rank1]:              ^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 598, in __call__
[rank1]:     return _compile(
[rank1]:            ^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1110, in _compile
[rank1]:     raise InternalTorchDynamoError(
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1059, in _compile
[rank1]:     guarded_code = compile_inner(code, one_graph, hooks, transform)
[rank1]:                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_utils_internal.py", line 97, in wrapper_function
[rank1]:     return function(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 761, in compile_inner
[rank1]:     return _compile_inner(code, one_graph, hooks, transform)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 797, in _compile_inner
[rank1]:     out_code = transform_code_object(code, transform)
[rank1]:                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/bytecode_transformation.py", line 1422, in transform_code_object
[rank1]:     transformations(instructions, code_options)
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 257, in _fn
[rank1]:     return fn(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 715, in transform
[rank1]:     tracer.run()
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 3498, in run
[rank1]:     super().run()
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 1337, in run
[rank1]:     while self.step():
[rank1]:           ^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 1246, in step
[rank1]:     self.dispatch_table[inst.opcode](self, inst)
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 2157, in COMPARE_OP
[rank1]:     self.push(compare_op_handlers[inst.argval](self, self.popn(2), {}))
[rank1]:               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 1111, in call_function
[rank1]:     return handler(tx, args, kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 789, in <lambda>
[rank1]:     return lambda tx, args, kwargs: obj.call_function(
[rank1]:                                     ^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 1111, in call_function
[rank1]:     return handler(tx, args, kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 945, in builtin_dispatch
[rank1]:     rv = fn(tx, args, kwargs)
[rank1]:          ^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 839, in call_binop_handlers
[rank1]:     rv = fn(tx, *args)
[rank1]:          ^^^^^^^^^^^^^
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 533, in compare_by_value
[rank1]:     return ConstantVariable(op(a.value, b.value))
[rank1]:                             ^^^^^^^^^^^^^^^^^^^^
[rank1]: torch._dynamo.exc.InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'

[rank1]: from user code:
[rank1]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857
[rank1]:     if window_size > 0:

[rank1]: Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
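Every attempt dies on the same comparison, so the failure is deterministic for this config. A plausible local workaround (a sketch under the assumption that an unset `window_size` should simply mean "no sliding-window branch"; the actual upstream fix in flash-linear-attention may differ) is to guard the check before compiling:

```python
# Hypothetical guard for the failing branch in fla/ops/nsa/parallel.py:
# treat an unset window_size the same as a disabled (0) window.
def use_sliding_window(window_size) -> bool:
    return window_size is not None and window_size > 0

assert not use_sliding_window(None)  # the "window_size": null case above
assert not use_sliding_window(0)
assert use_sliding_window(512)
```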
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/1/stdout.log
ADDED
File without changes
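stdout stayed empty because all of the run's console output above went to stderr. As a sanity check, the shard and batch figures the trainer logs there follow directly from the job config; a quick back-of-envelope verification (hypothetical snippet, not part of flame):

```python
dp_ranks, workers = 8, 32               # device mesh ['dp_shard'] x [8]; num_workers = 32
print(dp_ranks * workers)               # 256 -> minimum shards demanded per dataset

batch, gas, seq_len = 8, 2, 8192        # batch_size, gradient_accumulation_steps, seq_len
global_batch = batch * gas * dp_ranks   # 128 sequences per optimization step
print(global_batch * seq_len)           # 1,048,576 tokens per step
print(95_366 * global_batch * seq_len)  # 99,998,498,816 tokens over the full run
```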
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/2/error.json
ADDED
@@ -0,0 +1 @@
{"message": {"message": "InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'\n\nfrom user code:\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857\n if window_size > 0:\n\nSet TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS=\"+dynamo\"\n", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 488, in main\n output = model(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py\", line 172, in wrapped_func\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 424, in forward\n outputs = self.model(\n ^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 294, in forward\n hidden_states, attentions, past_key_values = layer(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 655, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 108, in forward\n hidden_states = self.attn_norm(hidden_states)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 109, in torch_dynamo_resume_in_forward_at_108\n hidden_states, attentions, past_key_values = self.attn(\n ^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py\", line 108, in forward\n q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py\", line 123, in torch_dynamo_resume_in_forward_at_108\n o = parallel_nsa(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 838, in parallel_nsa\n o_cmp, lse_cmp = parallel_nsa_compression(\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 857, in torch_dynamo_resume_in_parallel_nsa_at_838\n o = o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, 
block_size, scale, cu_seqlens)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1432, in __call__\n return self._torchdynamo_orig_callable(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1213, in __call__\n result = self._inner_convert(\n ^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 598, in __call__\n return _compile(\n ^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1110, in _compile\n raise InternalTorchDynamoError(\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1059, in _compile\n guarded_code = compile_inner(code, one_graph, hooks, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_utils_internal.py\", line 97, in wrapper_function\n return function(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 761, in compile_inner\n return _compile_inner(code, one_graph, hooks, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 797, in _compile_inner\n out_code = transform_code_object(code, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/bytecode_transformation.py\", line 1422, in transform_code_object\n transformations(instructions, code_options)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 257, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 715, in transform\n tracer.run()\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 3498, in run\n super().run()\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 1337, in run\n while self.step():\n ^^^^^^^^^^^\n File 
\"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 1246, in step\n self.dispatch_table[inst.opcode](self, inst)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 2157, in COMPARE_OP\n self.push(compare_op_handlers[inst.argval](self, self.popn(2), {}))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 1111, in call_function\n return handler(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 789, in <lambda>\n return lambda tx, args, kwargs: obj.call_function(\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 1111, in call_function\n return handler(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 945, in builtin_dispatch\n rv = fn(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 839, in call_binop_handlers\n rv = fn(tx, *args)\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 533, in compare_by_value\n return ConstantVariable(op(a.value, b.value))\n ^^^^^^^^^^^^^^^^^^^^\ntorch._dynamo.exc.InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'\n\nfrom user code:\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857\n if window_size > 0:\n\nSet TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS=\"+dynamo\"\n\n", "timestamp": "1753352475"}}}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/2/stderr.log
ADDED
@@ -0,0 +1,571 @@
+[titan] 2025-07-24 18:19:03,183 - root - INFO - Starting job: default job
+[titan] 2025-07-24 18:19:03,188 - root - INFO - {
+  "activation_checkpoint": {
+    "mode": "none",
+    "selective_ac_option": "2"
+  },
+  "activation_offload": {
+    "mode": "none"
+  },
+  "checkpoint": {
+    "async_mode": "disabled",
+    "create_seed_checkpoint": false,
+    "enable_checkpoint": true,
+    "exclude_from_loading": [],
+    "export_dtype": "bfloat16",
+    "folder": "checkpoint",
+    "interval": 8192,
+    "interval_type": "steps",
+    "keep_latest_k": 100,
+    "load_step": -1,
+    "model_weights_only": false
+  },
+  "comm": {
+    "init_timeout_seconds": 300,
+    "trace_buf_size": 20000,
+    "train_timeout_seconds": 100
+  },
+  "experimental": {
+    "context_parallel_degree": 1,
+    "context_parallel_rotate_method": "allgather",
+    "custom_model_path": "",
+    "enable_async_tensor_parallel": false,
+    "enable_compiled_autograd": false,
+    "pipeline_parallel_degree": 1,
+    "pipeline_parallel_microbatches": null,
+    "pipeline_parallel_schedule": "1F1B",
+    "pipeline_parallel_schedule_csv": "",
+    "pipeline_parallel_split_points": []
+  },
+  "fault_tolerance": {
+    "enable": false,
+    "group_size": 0,
+    "min_replica_size": 1,
+    "replica_id": 0
+  },
+  "float8": {
+    "enable_fsdp_float8_all_gather": false,
+    "force_recompute_fp8_weight_in_bwd": false,
+    "precompute_float8_dynamic_scale_for_fsdp": false,
+    "recipe_name": null
+  },
+  "job": {
+    "config_file": "flame/models/fla.toml",
+    "description": "default job",
+    "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2",
+    "print_args": true,
+    "use_for_integration_test": false
+  },
+  "lr_scheduler": {
+    "decay_ratio": 1.0,
+    "decay_type": "linear",
+    "lr_min": 0.01,
+    "warmup_steps": 100
+  },
+  "memory_estimation": {
+    "disable_fake_mode": false,
+    "enabled": false
+  },
+  "metrics": {
+    "disable_color_printing": false,
+    "enable_tensorboard": true,
+    "enable_wandb": true,
+    "log_freq": 1,
+    "save_for_all_ranks": false,
+    "save_tb_folder": "tb"
+  },
+  "model": {
+    "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_nsa_1_340M.json",
+    "converters": [],
+    "name": "fla",
+    "print_after_conversion": false,
+    "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
+  },
+  "optimizer": {
+    "early_step_in_backward": false,
+    "eps": 1e-08,
+    "implementation": "fused",
+    "lr": 0.0003,
+    "name": "AdamW"
+  },
+  "profiling": {
+    "enable_memory_snapshot": false,
+    "enable_profiling": true,
+    "profile_freq": 512,
+    "save_memory_snapshot_folder": "memory_snapshot",
+    "save_traces_folder": "profile_trace"
+  },
+  "training": {
+    "batch_size": 8,
+    "compile": true,
+    "context_len": 8192,
+    "data_dir": null,
+    "data_files": null,
+    "data_parallel_replicate_degree": 1,
+    "data_parallel_shard_degree": -1,
+    "data_probs": "0.55,0.3,0.15",
+    "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro",
+    "dataset_name": "default,default,default",
+    "dataset_split": "train,train,train",
+    "deterministic": false,
+    "disable_loss_parallel": false,
+    "enable_cpu_offload": false,
+    "fsdp_reshard_after_forward": "default",
+    "gc_freq": 50,
+    "gradient_accumulation_steps": 2,
+    "max_norm": 1.0,
+    "mixed_precision_param": "bfloat16",
+    "mixed_precision_reduce": "float32",
+    "num_workers": 32,
+    "persistent_workers": false,
+    "pin_memory": false,
+    "prefetch_factor": 2,
+    "seed": 42,
+    "seq_len": 8192,
+    "skip_nan_inf": true,
+    "steps": 95366,
+    "streaming": true,
+    "tensor_parallel_degree": 1,
+    "varlen": false
+  }
+}
+[titan] 2025-07-24 18:19:03,189 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
+[titan] 2025-07-24 18:19:04,471 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
+[titan] 2025-07-24 18:19:04,473 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
+[titan] 2025-07-24 18:19:04,528 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
+[titan] 2025-07-24 18:19:04,528 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
+[titan] 2025-07-24 18:19:04,528 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
+[titan] 2025-07-24 18:19:04,534 - root - INFO - Loading tokenizer...
+[titan] 2025-07-24 18:19:04,646 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
+    0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+    1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+    2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+}
+)
+[titan] 2025-07-24 18:19:04,646 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default,default,default
+[titan] 2025-07-24 18:19:04,780 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
+IterableDataset({
+    features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
+    num_shards: 140
+})
+[titan] 2025-07-24 18:19:04,780 - root - INFO - Shuffling the dataset with seed 42
+[titan] 2025-07-24 18:19:04,780 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
+[titan] 2025-07-24 18:20:10,498 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged:default (p = 0.300):
+IterableDataset({
+    features: ['repo', 'content'],
+    num_shards: 1
+})
+[titan] 2025-07-24 18:20:10,498 - root - INFO - Shuffling the dataset with seed 42
+[titan] 2025-07-24 18:20:10,498 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged has insufficient shards (1). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
+[titan] 2025-07-24 18:20:10,846 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default (p = 0.150):
+IterableDataset({
+    features: ['text', 'cc-path', 'domain', 'lang', 'lang_score', 'timestamp', 'url', 'math_score'],
+    num_shards: 100
+})
+[titan] 2025-07-24 18:20:10,846 - root - INFO - Shuffling the dataset with seed 42
+[titan] 2025-07-24 18:20:10,846 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro has insufficient shards (100). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
+[titan] 2025-07-24 18:20:17,401 - root - INFO - Interleaving 3 datasets with probabilities [0.55, 0.3, 0.15]
+[titan] 2025-07-24 18:20:18,103 - root - INFO - IterableDataset({
+    features: ['text', 'content'],
+    num_shards: 256
+})
+[titan] 2025-07-24 18:20:18,229 - root - INFO - Building dataloader...
+[titan] 2025-07-24 18:20:18,232 - root - INFO - Loading model config from /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_nsa_1_340M.json
+[titan] 2025-07-24 18:20:18,234 - root - INFO - Building model from the config
+GatedDeltaNetConfig {
+  "allow_neg_eigval": false,
+  "architectures": [
+    "GatedDeltaNetForCausalLM"
+  ],
+  "attn": {
+    "block_counts": 16,
+    "block_size": 64,
+    "layers": [
+      5,
+      11,
+      17,
+      23
+    ],
+    "num_heads": 32,
+    "num_kv_heads": 2,
+    "qkv_bias": false,
+    "rope_theta": 160000.0,
+    "type": "nsa",
+    "window_size": null
+  },
+  "attn_mode": "chunk",
+  "bos_token_id": 1,
+  "conv_size": 4,
+  "eos_token_id": 2,
+  "expand_k": 1,
+  "expand_v": 1,
+  "fuse_cross_entropy": true,
+  "fuse_norm": true,
+  "fuse_swiglu": true,
+  "head_dim": 256,
+  "hidden_act": "swish",
+  "hidden_ratio": 4,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": null,
+  "max_position_embeddings": 8192,
+  "model_type": "gated_deltanet",
+  "norm_eps": 1e-06,
+  "norm_first": false,
+  "num_heads": 4,
+  "num_hidden_layers": 24,
+  "num_v_heads": null,
+  "qk_activation": "silu",
+  "qk_norm": "l2",
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.53.3",
+  "use_beta": true,
+  "use_cache": true,
+  "use_gate": true,
+  "use_l2warp": false,
+  "use_output_norm": true,
+  "use_short_conv": true,
+  "vocab_size": 32000
+}
+
+[titan] 2025-07-24 18:20:18,567 - root - INFO -
+GatedDeltaNetForCausalLM(
+  (model): GatedDeltaNetModel(
+    (embeddings): Embedding(32000, 1024)
+    (layers): ModuleList(
+      (0-4): 5 x GatedDeltaNetBlock(
+        (attn_norm): RMSNorm(1024, eps=1e-06)
+        (attn): GatedDeltaNet(
+          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
+          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
+          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
+          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
+        )
+        (mlp_norm): RMSNorm(1024, eps=1e-06)
+        (mlp): GatedMLP(
+          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
+          (swiglu_linear): SwiGLULinear()
+        )
+      )
+      (5): GatedDeltaNetBlock(
+        (attn_norm): RMSNorm(1024, eps=1e-06)
+        (attn): NativeSparseAttention(
+          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
+          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
+          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
+          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
+          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
+          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
+        )
+        (mlp_norm): RMSNorm(1024, eps=1e-06)
+        (mlp): GatedMLP(
+          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
+          (swiglu_linear): SwiGLULinear()
+        )
+      )
+      (6-10): 5 x GatedDeltaNetBlock(
+        (attn_norm): RMSNorm(1024, eps=1e-06)
+        (attn): GatedDeltaNet(
+          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
+          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
+          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
+          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
+        )
+        (mlp_norm): RMSNorm(1024, eps=1e-06)
+        (mlp): GatedMLP(
+          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
+          (swiglu_linear): SwiGLULinear()
+        )
+      )
+      (11): GatedDeltaNetBlock(
+        (attn_norm): RMSNorm(1024, eps=1e-06)
+        (attn): NativeSparseAttention(
+          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
+          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
+          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
+          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
+          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
+          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
+        )
+        (mlp_norm): RMSNorm(1024, eps=1e-06)
+        (mlp): GatedMLP(
+          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
+          (swiglu_linear): SwiGLULinear()
+        )
+      )
+      (12-16): 5 x GatedDeltaNetBlock(
+        (attn_norm): RMSNorm(1024, eps=1e-06)
+        (attn): GatedDeltaNet(
+          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
+          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
+          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
+          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
+        )
+        (mlp_norm): RMSNorm(1024, eps=1e-06)
+        (mlp): GatedMLP(
+          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
+          (swiglu_linear): SwiGLULinear()
+        )
+      )
+      (17): GatedDeltaNetBlock(
+        (attn_norm): RMSNorm(1024, eps=1e-06)
+        (attn): NativeSparseAttention(
+          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
+          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
+          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
+          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
+          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
+          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
+        )
+        (mlp_norm): RMSNorm(1024, eps=1e-06)
+        (mlp): GatedMLP(
+          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
+          (swiglu_linear): SwiGLULinear()
+        )
+      )
+      (18-22): 5 x GatedDeltaNetBlock(
+        (attn_norm): RMSNorm(1024, eps=1e-06)
+        (attn): GatedDeltaNet(
+          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
+          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
+          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
+          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
+        )
+        (mlp_norm): RMSNorm(1024, eps=1e-06)
+        (mlp): GatedMLP(
+          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
+          (swiglu_linear): SwiGLULinear()
+        )
+      )
+      (23): GatedDeltaNetBlock(
+        (attn_norm): RMSNorm(1024, eps=1e-06)
+        (attn): NativeSparseAttention(
+          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
+          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
+          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
+          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
+          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
+          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
+        )
+        (mlp_norm): RMSNorm(1024, eps=1e-06)
+        (mlp): GatedMLP(
+          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
+          (swiglu_linear): SwiGLULinear()
+        )
+      )
+    )
+    (norm): RMSNorm(1024, eps=1e-06)
+  )
+  (lm_head): Linear(in_features=1024, out_features=32000, bias=False)
+  (criterion): FusedLinearCrossEntropyLoss()
+)
+
+[titan] 2025-07-24 18:20:18,604 - root - INFO - Compiling each block with torch.compile
+[titan] 2025-07-24 18:20:18,604 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
+[titan] 2025-07-24 18:20:18,605 - root - INFO - Compiling the entire model with torch.compile
+[titan] 2025-07-24 18:20:18,690 - root - INFO - Applied FSDP to the model
+[titan] 2025-07-24 18:20:18,757 - fla.models.gated_deltanet.modeling_gated_deltanet - WARNING - `A_log` is a DTensor, skipping initialization
+[titan] 2025-07-24 18:20:18,757 - fla.models.gated_deltanet.modeling_gated_deltanet - WARNING - `dt_bias` is a DTensor, skipping initialization
+[titan] 2025-07-24 18:20:18,858 - root - INFO - CUDA memory usage for model: 0.10GiB(0.10%)
+[titan] 2025-07-24 18:20:18,859 - root - WARNING - Warmup (100) + decay (95366) steps exceed total training steps (95366). Adjusting decay steps to 95266.
+[titan] 2025-07-24 18:20:18,884 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/checkpoint
+[titan] 2025-07-24 18:20:18,885 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
+[titan] 2025-07-24 18:20:18,944 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
+[titan] 2025-07-24 18:20:27,383 - root - INFO - ***** Running training *****
+[titan] 2025-07-24 18:20:27,385 - root - INFO - Training starts at step 1
+[titan] 2025-07-24 18:20:27,386 - root - INFO - Number of tokens per sequence = 8,192
+[titan] 2025-07-24 18:20:27,387 - root - INFO - Gradient Accumulation steps = 2
+[titan] 2025-07-24 18:20:27,387 - root - INFO - Instantaneous batch size (per device) = 8
+[titan] 2025-07-24 18:20:27,387 - root - INFO - Global batch size (w. parallel, distributed & accumulation) = 128 (1,048,576 tokens)
+[titan] 2025-07-24 18:20:27,387 - root - INFO - Total optimization steps = 95,366 (99,998,498,816 tokens)
+[titan] 2025-07-24 18:20:27,387 - root - INFO - Warmup steps = 100 (104,857,600 tokens)
+[titan] 2025-07-24 18:20:27,387 - root - INFO - Number of parameters = 396,695,712
+[titan] 2025-07-24 18:20:27,387 - root - INFO - Profiling active. Traces will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/profile_trace
+/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py:1263: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
+If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
+If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
+  torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
+[rank2]: Traceback (most recent call last):
+[rank2]:   File "<frozen runpy>", line 198, in _run_module_as_main
+[rank2]:   File "<frozen runpy>", line 88, in _run_code
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 616, in <module>
+[rank2]:     main(config)
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
+[rank2]:     return f(*args, **kwargs)
+[rank2]:            ^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 488, in main
+[rank2]:     output = model(
+[rank2]:              ^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+[rank2]:     return self._call_impl(*args, **kwargs)
+[rank2]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
+[rank2]:     return inner()
+[rank2]:            ^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
+[rank2]:     result = forward_call(*args, **kwargs)
+[rank2]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
+[rank2]:     return func(*args, **kwargs)
+[rank2]:            ^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 424, in forward
+[rank2]:     outputs = self.model(
+[rank2]:               ^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+[rank2]:     return self._call_impl(*args, **kwargs)
+[rank2]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+[rank2]:     return forward_call(*args, **kwargs)
+[rank2]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 294, in forward
+[rank2]:     hidden_states, attentions, past_key_values = layer(
+[rank2]:                                                  ^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+[rank2]:     return self._call_impl(*args, **kwargs)
+[rank2]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
+[rank2]:     return inner()
+[rank2]:            ^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
+[rank2]:     result = forward_call(*args, **kwargs)
+[rank2]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
+[rank2]:     return fn(*args, **kwargs)
+[rank2]:            ^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+[rank2]:     return self._call_impl(*args, **kwargs)
+[rank2]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+[rank2]:     return forward_call(*args, **kwargs)
+[rank2]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 108, in forward
+[rank2]:     hidden_states = self.attn_norm(hidden_states)
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 109, in torch_dynamo_resume_in_forward_at_108
+[rank2]:     hidden_states, attentions, past_key_values = self.attn(
+[rank2]:                                                  ^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+[rank2]:     return self._call_impl(*args, **kwargs)
+[rank2]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+[rank2]:     return forward_call(*args, **kwargs)
+[rank2]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py", line 108, in forward
+[rank2]:     q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py", line 123, in torch_dynamo_resume_in_forward_at_108
+[rank2]:     o = parallel_nsa(
+[rank2]:         ^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 838, in parallel_nsa
+[rank2]:     o_cmp, lse_cmp = parallel_nsa_compression(
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 857, in torch_dynamo_resume_in_parallel_nsa_at_838
+[rank2]:     o = o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, scale, cu_seqlens)
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1432, in __call__
+[rank2]:     return self._torchdynamo_orig_callable(
+[rank2]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1213, in __call__
+[rank2]:     result = self._inner_convert(
+[rank2]:              ^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 598, in __call__
+[rank2]:     return _compile(
+[rank2]:            ^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1110, in _compile
+[rank2]:     raise InternalTorchDynamoError(
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1059, in _compile
+[rank2]:     guarded_code = compile_inner(code, one_graph, hooks, transform)
+[rank2]:                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_utils_internal.py", line 97, in wrapper_function
+[rank2]:     return function(*args, **kwargs)
+[rank2]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 761, in compile_inner
+[rank2]:     return _compile_inner(code, one_graph, hooks, transform)
+[rank2]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 797, in _compile_inner
+[rank2]:     out_code = transform_code_object(code, transform)
+[rank2]:                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/bytecode_transformation.py", line 1422, in transform_code_object
+[rank2]:     transformations(instructions, code_options)
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 257, in _fn
+[rank2]:     return fn(*args, **kwargs)
+[rank2]:            ^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 715, in transform
+[rank2]:     tracer.run()
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 3498, in run
+[rank2]:     super().run()
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 1337, in run
+[rank2]:     while self.step():
+[rank2]:           ^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 1246, in step
+[rank2]:     self.dispatch_table[inst.opcode](self, inst)
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 2157, in COMPARE_OP
+[rank2]:     self.push(compare_op_handlers[inst.argval](self, self.popn(2), {}))
+[rank2]:               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 1111, in call_function
+[rank2]:     return handler(tx, args, kwargs)
+[rank2]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 789, in <lambda>
+[rank2]:     return lambda tx, args, kwargs: obj.call_function(
+[rank2]:                                     ^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 1111, in call_function
+[rank2]:     return handler(tx, args, kwargs)
+[rank2]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 945, in builtin_dispatch
+[rank2]:     rv = fn(tx, args, kwargs)
+[rank2]:          ^^^^^^^^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 839, in call_binop_handlers
+[rank2]:     rv = fn(tx, *args)
+[rank2]:          ^^^^^^^^^^^^^
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 533, in compare_by_value
+[rank2]:     return ConstantVariable(op(a.value, b.value))
+[rank2]:                             ^^^^^^^^^^^^^^^^^^^^
+[rank2]: torch._dynamo.exc.InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'
+
+[rank2]: from user code:
+[rank2]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857
+[rank2]:     if window_size > 0:
+
+[rank2]: Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
+
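Every attempt logged in this run dies at the same comparison, so a config-side workaround may be simpler than patching the op: give `attn.window_size` an integer value so the traced `window_size > 0` is well-typed. A sketch, assuming that `0` disables the sliding-window branch (consistent with the `if window_size > 0` guard above) and that the config file is reachable at the relative path shown, which is an assumption:

    import json

    # Hypothetical one-off edit of the model config used by this run.
    path = "configs/gdn_6_nsa_1_340M.json"  # assumed relative path
    with open(path) as f:
        cfg = json.load(f)
    cfg["attn"]["window_size"] = 0  # int instead of null; 0 would skip the SWA branch
    with open(path, "w") as f:
        json.dump(cfg, f, indent=2)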
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/2/stdout.log
ADDED
File without changes
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/3/error.json
ADDED
@@ -0,0 +1 @@
+
{"message": {"message": "InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'\n\nfrom user code:\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857\n if window_size > 0:\n\nSet TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS=\"+dynamo\"\n", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 488, in main\n output = model(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py\", line 172, in wrapped_func\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 424, in forward\n outputs = self.model(\n ^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 294, in forward\n hidden_states, attentions, past_key_values = layer(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 655, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 108, in forward\n hidden_states = self.attn_norm(hidden_states)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 109, in torch_dynamo_resume_in_forward_at_108\n hidden_states, attentions, past_key_values = self.attn(\n ^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py\", line 108, in forward\n q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py\", line 123, in torch_dynamo_resume_in_forward_at_108\n o = parallel_nsa(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 838, in parallel_nsa\n o_cmp, lse_cmp = parallel_nsa_compression(\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 857, in torch_dynamo_resume_in_parallel_nsa_at_838\n o = o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, 
block_size, scale, cu_seqlens)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1432, in __call__\n return self._torchdynamo_orig_callable(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1213, in __call__\n result = self._inner_convert(\n ^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 598, in __call__\n return _compile(\n ^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1110, in _compile\n raise InternalTorchDynamoError(\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1059, in _compile\n guarded_code = compile_inner(code, one_graph, hooks, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_utils_internal.py\", line 97, in wrapper_function\n return function(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 761, in compile_inner\n return _compile_inner(code, one_graph, hooks, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 797, in _compile_inner\n out_code = transform_code_object(code, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/bytecode_transformation.py\", line 1422, in transform_code_object\n transformations(instructions, code_options)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 257, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 715, in transform\n tracer.run()\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 3498, in run\n super().run()\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 1337, in run\n while self.step():\n ^^^^^^^^^^^\n File 
\"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 1246, in step\n self.dispatch_table[inst.opcode](self, inst)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 2157, in COMPARE_OP\n self.push(compare_op_handlers[inst.argval](self, self.popn(2), {}))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 1111, in call_function\n return handler(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 789, in <lambda>\n return lambda tx, args, kwargs: obj.call_function(\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 1111, in call_function\n return handler(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 945, in builtin_dispatch\n rv = fn(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 839, in call_binop_handlers\n rv = fn(tx, *args)\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 533, in compare_by_value\n return ConstantVariable(op(a.value, b.value))\n ^^^^^^^^^^^^^^^^^^^^\ntorch._dynamo.exc.InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'\n\nfrom user code:\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857\n if window_size > 0:\n\nSet TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS=\"+dynamo\"\n\n", "timestamp": "1753352475"}}}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/3/stderr.log
ADDED
@@ -0,0 +1,571 @@
[titan] 2025-07-24 18:19:05,907 - root - INFO - Starting job: default job
[titan] 2025-07-24 18:19:05,908 - root - INFO - {
  "activation_checkpoint": {
    "mode": "none",
    "selective_ac_option": "2"
  },
  "activation_offload": {
    "mode": "none"
  },
  "checkpoint": {
    "async_mode": "disabled",
    "create_seed_checkpoint": false,
    "enable_checkpoint": true,
    "exclude_from_loading": [],
    "export_dtype": "bfloat16",
    "folder": "checkpoint",
    "interval": 8192,
    "interval_type": "steps",
    "keep_latest_k": 100,
    "load_step": -1,
    "model_weights_only": false
  },
  "comm": {
    "init_timeout_seconds": 300,
    "trace_buf_size": 20000,
    "train_timeout_seconds": 100
  },
  "experimental": {
    "context_parallel_degree": 1,
    "context_parallel_rotate_method": "allgather",
    "custom_model_path": "",
    "enable_async_tensor_parallel": false,
    "enable_compiled_autograd": false,
    "pipeline_parallel_degree": 1,
    "pipeline_parallel_microbatches": null,
    "pipeline_parallel_schedule": "1F1B",
    "pipeline_parallel_schedule_csv": "",
    "pipeline_parallel_split_points": []
  },
  "fault_tolerance": {
    "enable": false,
    "group_size": 0,
    "min_replica_size": 1,
    "replica_id": 0
  },
  "float8": {
    "enable_fsdp_float8_all_gather": false,
    "force_recompute_fp8_weight_in_bwd": false,
    "precompute_float8_dynamic_scale_for_fsdp": false,
    "recipe_name": null
  },
  "job": {
    "config_file": "flame/models/fla.toml",
    "description": "default job",
    "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2",
    "print_args": true,
    "use_for_integration_test": false
  },
  "lr_scheduler": {
    "decay_ratio": 1.0,
    "decay_type": "linear",
    "lr_min": 0.01,
    "warmup_steps": 100
  },
  "memory_estimation": {
    "disable_fake_mode": false,
    "enabled": false
  },
  "metrics": {
    "disable_color_printing": false,
    "enable_tensorboard": true,
    "enable_wandb": true,
    "log_freq": 1,
    "save_for_all_ranks": false,
    "save_tb_folder": "tb"
  },
  "model": {
    "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_nsa_1_340M.json",
    "converters": [],
    "name": "fla",
    "print_after_conversion": false,
    "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
  },
  "optimizer": {
    "early_step_in_backward": false,
    "eps": 1e-08,
    "implementation": "fused",
    "lr": 0.0003,
    "name": "AdamW"
  },
  "profiling": {
    "enable_memory_snapshot": false,
    "enable_profiling": true,
    "profile_freq": 512,
    "save_memory_snapshot_folder": "memory_snapshot",
    "save_traces_folder": "profile_trace"
  },
  "training": {
    "batch_size": 8,
    "compile": true,
    "context_len": 8192,
    "data_dir": null,
    "data_files": null,
    "data_parallel_replicate_degree": 1,
    "data_parallel_shard_degree": -1,
    "data_probs": "0.55,0.3,0.15",
    "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro",
    "dataset_name": "default,default,default",
    "dataset_split": "train,train,train",
    "deterministic": false,
    "disable_loss_parallel": false,
    "enable_cpu_offload": false,
    "fsdp_reshard_after_forward": "default",
    "gc_freq": 50,
    "gradient_accumulation_steps": 2,
    "max_norm": 1.0,
    "mixed_precision_param": "bfloat16",
    "mixed_precision_reduce": "float32",
    "num_workers": 32,
    "persistent_workers": false,
    "pin_memory": false,
    "prefetch_factor": 2,
    "seed": 42,
    "seq_len": 8192,
    "skip_nan_inf": true,
    "steps": 95366,
    "streaming": true,
    "tensor_parallel_degree": 1,
    "varlen": false
  }
}
[titan] 2025-07-24 18:19:05,908 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
[titan] 2025-07-24 18:19:06,594 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
[titan] 2025-07-24 18:19:06,597 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
[titan] 2025-07-24 18:19:06,654 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
[titan] 2025-07-24 18:19:06,654 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
[titan] 2025-07-24 18:19:06,654 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
[titan] 2025-07-24 18:19:06,686 - root - INFO - Loading tokenizer...
[titan] 2025-07-24 18:19:06,794 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
    0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
    2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)
[titan] 2025-07-24 18:19:06,794 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default,default,default
[titan] 2025-07-24 18:19:06,906 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
IterableDataset({
    features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
    num_shards: 140
})
[titan] 2025-07-24 18:19:06,906 - root - INFO - Shuffling the dataset with seed 42
[titan] 2025-07-24 18:19:06,906 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
[titan] 2025-07-24 18:19:57,464 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged:default (p = 0.300):
IterableDataset({
    features: ['repo', 'content'],
    num_shards: 1
})
[titan] 2025-07-24 18:19:57,464 - root - INFO - Shuffling the dataset with seed 42
[titan] 2025-07-24 18:19:57,464 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged has insufficient shards (1). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
[titan] 2025-07-24 18:19:57,792 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default (p = 0.150):
IterableDataset({
    features: ['text', 'cc-path', 'domain', 'lang', 'lang_score', 'timestamp', 'url', 'math_score'],
    num_shards: 100
})
[titan] 2025-07-24 18:19:57,792 - root - INFO - Shuffling the dataset with seed 42
[titan] 2025-07-24 18:19:57,793 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro has insufficient shards (100). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
[titan] 2025-07-24 18:20:03,981 - root - INFO - Interleaving 3 datasets with probabilities [0.55, 0.3, 0.15]
[titan] 2025-07-24 18:20:04,684 - root - INFO - IterableDataset({
    features: ['text', 'content'],
    num_shards: 256
})
[titan] 2025-07-24 18:20:04,815 - root - INFO - Building dataloader...
[titan] 2025-07-24 18:20:04,817 - root - INFO - Loading model config from /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_nsa_1_340M.json
[titan] 2025-07-24 18:20:04,819 - root - INFO - Building model from the config
GatedDeltaNetConfig {
  "allow_neg_eigval": false,
  "architectures": [
    "GatedDeltaNetForCausalLM"
  ],
  "attn": {
    "block_counts": 16,
    "block_size": 64,
    "layers": [
      5,
      11,
      17,
      23
    ],
    "num_heads": 32,
    "num_kv_heads": 2,
    "qkv_bias": false,
    "rope_theta": 160000.0,
    "type": "nsa",
    "window_size": null
  },
  "attn_mode": "chunk",
  "bos_token_id": 1,
  "conv_size": 4,
  "eos_token_id": 2,
  "expand_k": 1,
  "expand_v": 1,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "fuse_swiglu": true,
  "head_dim": 256,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "max_position_embeddings": 8192,
  "model_type": "gated_deltanet",
  "norm_eps": 1e-06,
  "norm_first": false,
  "num_heads": 4,
  "num_hidden_layers": 24,
  "num_v_heads": null,
  "qk_activation": "silu",
  "qk_norm": "l2",
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.53.3",
  "use_beta": true,
  "use_cache": true,
  "use_gate": true,
  "use_l2warp": false,
  "use_output_norm": true,
  "use_short_conv": true,
  "vocab_size": 32000
}

[titan] 2025-07-24 18:20:05,139 - root - INFO -
GatedDeltaNetForCausalLM(
  (model): GatedDeltaNetModel(
    (embeddings): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-4): 5 x GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): GatedDeltaNet(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (5): GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): NativeSparseAttention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (6-10): 5 x GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): GatedDeltaNet(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (11): GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): NativeSparseAttention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (12-16): 5 x GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): GatedDeltaNet(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (17): GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): NativeSparseAttention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (18-22): 5 x GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): GatedDeltaNet(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (23): GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): NativeSparseAttention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
    )
    (norm): RMSNorm(1024, eps=1e-06)
  )
  (lm_head): Linear(in_features=1024, out_features=32000, bias=False)
  (criterion): FusedLinearCrossEntropyLoss()
)

[titan] 2025-07-24 18:20:05,174 - root - INFO - Compiling each block with torch.compile
[titan] 2025-07-24 18:20:05,174 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
[titan] 2025-07-24 18:20:05,174 - root - INFO - Compiling the entire model with torch.compile
[titan] 2025-07-24 18:20:05,254 - root - INFO - Applied FSDP to the model
[titan] 2025-07-24 18:20:05,399 - fla.models.gated_deltanet.modeling_gated_deltanet - WARNING - `A_log` is a DTensor, skipping initialization
[titan] 2025-07-24 18:20:05,399 - fla.models.gated_deltanet.modeling_gated_deltanet - WARNING - `dt_bias` is a DTensor, skipping initialization
[titan] 2025-07-24 18:20:05,495 - root - INFO - CUDA memory usage for model: 0.10GiB(0.10%)
[titan] 2025-07-24 18:20:05,497 - root - WARNING - Warmup (100) + decay (95366) steps exceed total training steps (95366). Adjusting decay steps to 95266.
[titan] 2025-07-24 18:20:05,520 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/checkpoint
[titan] 2025-07-24 18:20:05,521 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
[titan] 2025-07-24 18:20:05,590 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
[titan] 2025-07-24 18:20:11,953 - root - INFO - ***** Running training *****
[titan] 2025-07-24 18:20:11,955 - root - INFO - Training starts at step 1
[titan] 2025-07-24 18:20:11,956 - root - INFO - Number of tokens per sequence = 8,192
[titan] 2025-07-24 18:20:11,956 - root - INFO - Gradient Accumulation steps = 2
[titan] 2025-07-24 18:20:11,957 - root - INFO - Instantaneous batch size (per device) = 8
[titan] 2025-07-24 18:20:11,957 - root - INFO - Global batch size (w. parallel, distributed & accumulation) = 128 (1,048,576 tokens)
[titan] 2025-07-24 18:20:11,957 - root - INFO - Total optimization steps = 95,366 (99,998,498,816 tokens)
[titan] 2025-07-24 18:20:11,957 - root - INFO - Warmup steps = 100 (104,857,600 tokens)
[titan] 2025-07-24 18:20:11,957 - root - INFO - Number of parameters = 396,695,712
[titan] 2025-07-24 18:20:11,957 - root - INFO - Profiling active. Traces will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/profile_trace
/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py:1263: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
  torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
[rank3]: Traceback (most recent call last):
[rank3]:   File "<frozen runpy>", line 198, in _run_module_as_main
[rank3]:   File "<frozen runpy>", line 88, in _run_code
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 616, in <module>
[rank3]:     main(config)
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
[rank3]:     return f(*args, **kwargs)
[rank3]:            ^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 488, in main
[rank3]:     output = model(
[rank3]:              ^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank3]:     return self._call_impl(*args, **kwargs)
[rank3]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
[rank3]:     return inner()
[rank3]:            ^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
[rank3]:     result = forward_call(*args, **kwargs)
[rank3]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
[rank3]:     return func(*args, **kwargs)
[rank3]:            ^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 424, in forward
[rank3]:     outputs = self.model(
[rank3]:               ^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank3]:     return self._call_impl(*args, **kwargs)
[rank3]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
[rank3]:     return forward_call(*args, **kwargs)
[rank3]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 294, in forward
[rank3]:     hidden_states, attentions, past_key_values = layer(
[rank3]:                                                  ^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank3]:     return self._call_impl(*args, **kwargs)
[rank3]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
[rank3]:     return inner()
[rank3]:            ^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
[rank3]:     result = forward_call(*args, **kwargs)
[rank3]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
[rank3]:     return fn(*args, **kwargs)
[rank3]:            ^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank3]:     return self._call_impl(*args, **kwargs)
[rank3]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
[rank3]:     return forward_call(*args, **kwargs)
[rank3]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 108, in forward
[rank3]:     hidden_states = self.attn_norm(hidden_states)
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 109, in torch_dynamo_resume_in_forward_at_108
[rank3]:     hidden_states, attentions, past_key_values = self.attn(
[rank3]:                                                  ^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank3]:     return self._call_impl(*args, **kwargs)
[rank3]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
[rank3]:     return forward_call(*args, **kwargs)
[rank3]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py", line 108, in forward
[rank3]:     q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py", line 123, in torch_dynamo_resume_in_forward_at_108
[rank3]:     o = parallel_nsa(
[rank3]:         ^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 838, in parallel_nsa
[rank3]:     o_cmp, lse_cmp = parallel_nsa_compression(
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 857, in torch_dynamo_resume_in_parallel_nsa_at_838
[rank3]:     o = o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, scale, cu_seqlens)
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1432, in __call__
[rank3]:     return self._torchdynamo_orig_callable(
[rank3]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1213, in __call__
[rank3]:     result = self._inner_convert(
[rank3]:              ^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 598, in __call__
[rank3]:     return _compile(
[rank3]:            ^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1110, in _compile
[rank3]:     raise InternalTorchDynamoError(
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1059, in _compile
[rank3]:     guarded_code = compile_inner(code, one_graph, hooks, transform)
[rank3]:                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_utils_internal.py", line 97, in wrapper_function
[rank3]:     return function(*args, **kwargs)
[rank3]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 761, in compile_inner
[rank3]:     return _compile_inner(code, one_graph, hooks, transform)
[rank3]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 797, in _compile_inner
[rank3]:     out_code = transform_code_object(code, transform)
[rank3]:                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/bytecode_transformation.py", line 1422, in transform_code_object
[rank3]:     transformations(instructions, code_options)
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 257, in _fn
[rank3]:     return fn(*args, **kwargs)
[rank3]:            ^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 715, in transform
[rank3]:     tracer.run()
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 3498, in run
[rank3]:     super().run()
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 1337, in run
[rank3]:     while self.step():
[rank3]:           ^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 1246, in step
[rank3]:     self.dispatch_table[inst.opcode](self, inst)
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 2157, in COMPARE_OP
[rank3]:     self.push(compare_op_handlers[inst.argval](self, self.popn(2), {}))
[rank3]:               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 1111, in call_function
[rank3]:     return handler(tx, args, kwargs)
[rank3]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 789, in <lambda>
[rank3]:     return lambda tx, args, kwargs: obj.call_function(
[rank3]:                                     ^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 1111, in call_function
[rank3]:     return handler(tx, args, kwargs)
[rank3]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 945, in builtin_dispatch
[rank3]:     rv = fn(tx, args, kwargs)
[rank3]:          ^^^^^^^^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 839, in call_binop_handlers
[rank3]:     rv = fn(tx, *args)
[rank3]:          ^^^^^^^^^^^^^
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 533, in compare_by_value
[rank3]:     return ConstantVariable(op(a.value, b.value))
[rank3]:            ^^^^^^^^^^^^^^^^^^^^
[rank3]: torch._dynamo.exc.InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'

[rank3]: from user code:
[rank3]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857
[rank3]:     if window_size > 0:

[rank3]: Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
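Editor's note: attempts 3 and 4 fail identically to the earlier attempts, so the crash is deterministic, not flaky. One plausible config-level workaround, assuming the kernel's `if window_size > 0:` guard treats 0 as "sliding window disabled" (an assumption; the proper fix is a None check in fla itself), is to rewrite the null before launching training:

    # patch_config.py -- hypothetical helper, not part of flame or fla.
    import json

    # Path as referenced by the job config above; adjust to your checkout.
    cfg_path = "configs/gdn_6_nsa_1_340M.json"

    with open(cfg_path) as f:
        cfg = json.load(f)

    # Replace the null sliding window with an explicit 0 so `window_size > 0`
    # is a valid int comparison and the sliding-window branch stays disabled.
    if "attn" in cfg and cfg["attn"].get("window_size") is None:
        cfg["attn"]["window_size"] = 0

    with open(cfg_path, "w") as f:
        json.dump(cfg, f, indent=2)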
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/3/stdout.log
ADDED
File without changes
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/4/error.json
ADDED
@@ -0,0 +1 @@
{"message": {"message": "InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'\n\nfrom user code:\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857\n if window_size > 0:\n\nSet TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS=\"+dynamo\"\n", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 488, in main\n output = model(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py\", line 172, in wrapped_func\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 424, in forward\n outputs = self.model(\n ^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 294, in forward\n hidden_states, attentions, past_key_values = layer(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 655, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 108, in forward\n hidden_states = self.attn_norm(hidden_states)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 109, in torch_dynamo_resume_in_forward_at_108\n hidden_states, attentions, past_key_values = self.attn(\n ^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py\", line 108, in forward\n q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py\", line 123, in torch_dynamo_resume_in_forward_at_108\n o = parallel_nsa(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 838, in parallel_nsa\n o_cmp, lse_cmp = parallel_nsa_compression(\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 857, in torch_dynamo_resume_in_parallel_nsa_at_838\n o = o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, 
block_size, scale, cu_seqlens)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1432, in __call__\n return self._torchdynamo_orig_callable(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1213, in __call__\n result = self._inner_convert(\n ^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 598, in __call__\n return _compile(\n ^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1110, in _compile\n raise InternalTorchDynamoError(\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1059, in _compile\n guarded_code = compile_inner(code, one_graph, hooks, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_utils_internal.py\", line 97, in wrapper_function\n return function(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 761, in compile_inner\n return _compile_inner(code, one_graph, hooks, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 797, in _compile_inner\n out_code = transform_code_object(code, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/bytecode_transformation.py\", line 1422, in transform_code_object\n transformations(instructions, code_options)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 257, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 715, in transform\n tracer.run()\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 3498, in run\n super().run()\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 1337, in run\n while self.step():\n ^^^^^^^^^^^\n File 
\"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 1246, in step\n self.dispatch_table[inst.opcode](self, inst)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 2157, in COMPARE_OP\n self.push(compare_op_handlers[inst.argval](self, self.popn(2), {}))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 1111, in call_function\n return handler(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 789, in <lambda>\n return lambda tx, args, kwargs: obj.call_function(\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 1111, in call_function\n return handler(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 945, in builtin_dispatch\n rv = fn(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 839, in call_binop_handlers\n rv = fn(tx, *args)\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 533, in compare_by_value\n return ConstantVariable(op(a.value, b.value))\n ^^^^^^^^^^^^^^^^^^^^\ntorch._dynamo.exc.InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'\n\nfrom user code:\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857\n if window_size > 0:\n\nSet TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS=\"+dynamo\"\n\n", "timestamp": "1753352474"}}}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/4/stderr.log
ADDED
@@ -0,0 +1,571 @@
[titan] 2025-07-24 18:19:07,134 - root - INFO - Starting job: default job
[titan] 2025-07-24 18:19:07,134 - root - INFO - {
  "activation_checkpoint": {
    "mode": "none",
    "selective_ac_option": "2"
  },
  "activation_offload": {
    "mode": "none"
  },
  "checkpoint": {
    "async_mode": "disabled",
    "create_seed_checkpoint": false,
    "enable_checkpoint": true,
    "exclude_from_loading": [],
    "export_dtype": "bfloat16",
    "folder": "checkpoint",
    "interval": 8192,
    "interval_type": "steps",
    "keep_latest_k": 100,
    "load_step": -1,
    "model_weights_only": false
  },
  "comm": {
    "init_timeout_seconds": 300,
    "trace_buf_size": 20000,
    "train_timeout_seconds": 100
  },
  "experimental": {
    "context_parallel_degree": 1,
    "context_parallel_rotate_method": "allgather",
    "custom_model_path": "",
    "enable_async_tensor_parallel": false,
    "enable_compiled_autograd": false,
    "pipeline_parallel_degree": 1,
    "pipeline_parallel_microbatches": null,
    "pipeline_parallel_schedule": "1F1B",
    "pipeline_parallel_schedule_csv": "",
    "pipeline_parallel_split_points": []
  },
  "fault_tolerance": {
    "enable": false,
    "group_size": 0,
    "min_replica_size": 1,
    "replica_id": 0
  },
  "float8": {
    "enable_fsdp_float8_all_gather": false,
    "force_recompute_fp8_weight_in_bwd": false,
    "precompute_float8_dynamic_scale_for_fsdp": false,
    "recipe_name": null
  },
  "job": {
    "config_file": "flame/models/fla.toml",
    "description": "default job",
    "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2",
    "print_args": true,
    "use_for_integration_test": false
  },
  "lr_scheduler": {
    "decay_ratio": 1.0,
    "decay_type": "linear",
    "lr_min": 0.01,
    "warmup_steps": 100
  },
  "memory_estimation": {
    "disable_fake_mode": false,
    "enabled": false
  },
  "metrics": {
    "disable_color_printing": false,
    "enable_tensorboard": true,
    "enable_wandb": true,
    "log_freq": 1,
    "save_for_all_ranks": false,
    "save_tb_folder": "tb"
  },
  "model": {
    "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_nsa_1_340M.json",
    "converters": [],
    "name": "fla",
    "print_after_conversion": false,
    "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
  },
  "optimizer": {
    "early_step_in_backward": false,
    "eps": 1e-08,
    "implementation": "fused",
    "lr": 0.0003,
    "name": "AdamW"
  },
  "profiling": {
    "enable_memory_snapshot": false,
    "enable_profiling": true,
    "profile_freq": 512,
    "save_memory_snapshot_folder": "memory_snapshot",
    "save_traces_folder": "profile_trace"
  },
  "training": {
    "batch_size": 8,
    "compile": true,
    "context_len": 8192,
    "data_dir": null,
    "data_files": null,
    "data_parallel_replicate_degree": 1,
    "data_parallel_shard_degree": -1,
    "data_probs": "0.55,0.3,0.15",
    "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro",
    "dataset_name": "default,default,default",
    "dataset_split": "train,train,train",
    "deterministic": false,
    "disable_loss_parallel": false,
    "enable_cpu_offload": false,
    "fsdp_reshard_after_forward": "default",
    "gc_freq": 50,
    "gradient_accumulation_steps": 2,
    "max_norm": 1.0,
    "mixed_precision_param": "bfloat16",
    "mixed_precision_reduce": "float32",
    "num_workers": 32,
    "persistent_workers": false,
    "pin_memory": false,
    "prefetch_factor": 2,
    "seed": 42,
    "seq_len": 8192,
    "skip_nan_inf": true,
    "steps": 95366,
    "streaming": true,
    "tensor_parallel_degree": 1,
    "varlen": false
  }
}
[titan] 2025-07-24 18:19:07,135 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
[titan] 2025-07-24 18:19:07,697 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
[titan] 2025-07-24 18:19:07,700 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
[titan] 2025-07-24 18:19:07,780 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
[titan] 2025-07-24 18:19:07,780 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
[titan] 2025-07-24 18:19:07,780 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
[titan] 2025-07-24 18:19:07,847 - root - INFO - Loading tokenizer...
[titan] 2025-07-24 18:19:07,960 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
  0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
  1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
  2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)
[titan] 2025-07-24 18:19:07,960 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default,default,default
[titan] 2025-07-24 18:19:08,077 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
IterableDataset({
    features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
    num_shards: 140
})
[titan] 2025-07-24 18:19:08,077 - root - INFO - Shuffling the dataset with seed 42
[titan] 2025-07-24 18:19:08,078 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
[titan] 2025-07-24 18:19:59,309 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged:default (p = 0.300):
IterableDataset({
    features: ['repo', 'content'],
    num_shards: 1
})
[titan] 2025-07-24 18:19:59,309 - root - INFO - Shuffling the dataset with seed 42
[titan] 2025-07-24 18:19:59,309 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged has insufficient shards (1). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
[titan] 2025-07-24 18:19:59,651 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default (p = 0.150):
IterableDataset({
    features: ['text', 'cc-path', 'domain', 'lang', 'lang_score', 'timestamp', 'url', 'math_score'],
    num_shards: 100
})
[titan] 2025-07-24 18:19:59,652 - root - INFO - Shuffling the dataset with seed 42
[titan] 2025-07-24 18:19:59,652 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro has insufficient shards (100). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
[titan] 2025-07-24 18:20:05,814 - root - INFO - Interleaving 3 datasets with probabilities [0.55, 0.3, 0.15]
[titan] 2025-07-24 18:20:06,503 - root - INFO - IterableDataset({
    features: ['text', 'content'],
    num_shards: 256
})
[titan] 2025-07-24 18:20:06,623 - root - INFO - Building dataloader...
[titan] 2025-07-24 18:20:06,625 - root - INFO - Loading model config from /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_nsa_1_340M.json
[titan] 2025-07-24 18:20:06,627 - root - INFO - Building model from the config
GatedDeltaNetConfig {
  "allow_neg_eigval": false,
  "architectures": [
    "GatedDeltaNetForCausalLM"
  ],
  "attn": {
    "block_counts": 16,
    "block_size": 64,
    "layers": [
      5,
      11,
      17,
      23
    ],
    "num_heads": 32,
    "num_kv_heads": 2,
    "qkv_bias": false,
    "rope_theta": 160000.0,
    "type": "nsa",
    "window_size": null
  },
  "attn_mode": "chunk",
  "bos_token_id": 1,
  "conv_size": 4,
  "eos_token_id": 2,
  "expand_k": 1,
  "expand_v": 1,
  "fuse_cross_entropy": true,
  "fuse_norm": true,
  "fuse_swiglu": true,
  "head_dim": 256,
  "hidden_act": "swish",
  "hidden_ratio": 4,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": null,
  "max_position_embeddings": 8192,
  "model_type": "gated_deltanet",
  "norm_eps": 1e-06,
  "norm_first": false,
  "num_heads": 4,
  "num_hidden_layers": 24,
  "num_v_heads": null,
  "qk_activation": "silu",
  "qk_norm": "l2",
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.53.3",
  "use_beta": true,
  "use_cache": true,
  "use_gate": true,
  "use_l2warp": false,
  "use_output_norm": true,
  "use_short_conv": true,
  "vocab_size": 32000
}
[titan] 2025-07-24 18:20:06,946 - root - INFO -
GatedDeltaNetForCausalLM(
  (model): GatedDeltaNetModel(
    (embeddings): Embedding(32000, 1024)
    (layers): ModuleList(
      (0-4): 5 x GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): GatedDeltaNet(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (5): GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): NativeSparseAttention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (6-10): 5 x GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): GatedDeltaNet(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (11): GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): NativeSparseAttention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (12-16): 5 x GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): GatedDeltaNet(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (17): GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): NativeSparseAttention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (18-22): 5 x GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): GatedDeltaNet(
          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
      (23): GatedDeltaNetBlock(
        (attn_norm): RMSNorm(1024, eps=1e-06)
        (attn): NativeSparseAttention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
        )
        (mlp_norm): RMSNorm(1024, eps=1e-06)
        (mlp): GatedMLP(
          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
          (swiglu_linear): SwiGLULinear()
        )
      )
    )
    (norm): RMSNorm(1024, eps=1e-06)
  )
  (lm_head): Linear(in_features=1024, out_features=32000, bias=False)
  (criterion): FusedLinearCrossEntropyLoss()
)
[titan] 2025-07-24 18:20:06,981 - root - INFO - Compiling each block with torch.compile
[titan] 2025-07-24 18:20:06,981 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
[titan] 2025-07-24 18:20:06,981 - root - INFO - Compiling the entire model with torch.compile
[titan] 2025-07-24 18:20:07,063 - root - INFO - Applied FSDP to the model
[titan] 2025-07-24 18:20:07,129 - fla.models.gated_deltanet.modeling_gated_deltanet - WARNING - `A_log` is a DTensor, skipping initialization
[titan] 2025-07-24 18:20:07,129 - fla.models.gated_deltanet.modeling_gated_deltanet - WARNING - `dt_bias` is a DTensor, skipping initialization
[titan] 2025-07-24 18:20:07,218 - root - INFO - CUDA memory usage for model: 0.10GiB(0.10%)
[titan] 2025-07-24 18:20:07,220 - root - WARNING - Warmup (100) + decay (95366) steps exceed total training steps (95366). Adjusting decay steps to 95266.
[titan] 2025-07-24 18:20:07,243 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/checkpoint
[titan] 2025-07-24 18:20:07,243 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
[titan] 2025-07-24 18:20:07,298 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
[titan] 2025-07-24 18:20:13,800 - root - INFO - ***** Running training *****
[titan] 2025-07-24 18:20:13,803 - root - INFO - Training starts at step 1
[titan] 2025-07-24 18:20:13,803 - root - INFO - Number of tokens per sequence = 8,192
[titan] 2025-07-24 18:20:13,803 - root - INFO - Gradient Accumulation steps = 2
[titan] 2025-07-24 18:20:13,803 - root - INFO - Instantaneous batch size (per device) = 8
[titan] 2025-07-24 18:20:13,803 - root - INFO - Global batch size (w. parallel, distributed & accumulation) = 128 (1,048,576 tokens)
[titan] 2025-07-24 18:20:13,803 - root - INFO - Total optimization steps = 95,366 (99,998,498,816 tokens)
[titan] 2025-07-24 18:20:13,804 - root - INFO - Warmup steps = 100 (104,857,600 tokens)
[titan] 2025-07-24 18:20:13,804 - root - INFO - Number of parameters = 396,695,712
[titan] 2025-07-24 18:20:13,804 - root - INFO - Profiling active. Traces will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/profile_trace
/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py:1263: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
  torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
[rank4]: Traceback (most recent call last):
[rank4]:   File "<frozen runpy>", line 198, in _run_module_as_main
[rank4]:   File "<frozen runpy>", line 88, in _run_code
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 616, in <module>
[rank4]:     main(config)
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
[rank4]:     return f(*args, **kwargs)
[rank4]:            ^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 488, in main
[rank4]:     output = model(
[rank4]:              ^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank4]:     return self._call_impl(*args, **kwargs)
[rank4]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
[rank4]:     return inner()
[rank4]:            ^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
[rank4]:     result = forward_call(*args, **kwargs)
[rank4]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
[rank4]:     return func(*args, **kwargs)
[rank4]:            ^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 424, in forward
[rank4]:     outputs = self.model(
[rank4]:               ^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank4]:     return self._call_impl(*args, **kwargs)
[rank4]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
[rank4]:     return forward_call(*args, **kwargs)
[rank4]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 294, in forward
[rank4]:     hidden_states, attentions, past_key_values = layer(
[rank4]:                                                  ^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank4]:     return self._call_impl(*args, **kwargs)
[rank4]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
[rank4]:     return inner()
[rank4]:            ^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
[rank4]:     result = forward_call(*args, **kwargs)
[rank4]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
[rank4]:     return fn(*args, **kwargs)
[rank4]:            ^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank4]:     return self._call_impl(*args, **kwargs)
[rank4]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
[rank4]:     return forward_call(*args, **kwargs)
[rank4]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 108, in forward
[rank4]:     hidden_states = self.attn_norm(hidden_states)
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 109, in torch_dynamo_resume_in_forward_at_108
[rank4]:     hidden_states, attentions, past_key_values = self.attn(
[rank4]:                                                  ^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
[rank4]:     return self._call_impl(*args, **kwargs)
[rank4]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
[rank4]:     return forward_call(*args, **kwargs)
[rank4]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py", line 108, in forward
[rank4]:     q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py", line 123, in torch_dynamo_resume_in_forward_at_108
[rank4]:     o = parallel_nsa(
[rank4]:         ^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 838, in parallel_nsa
[rank4]:     o_cmp, lse_cmp = parallel_nsa_compression(
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 857, in torch_dynamo_resume_in_parallel_nsa_at_838
[rank4]:     o = o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, scale, cu_seqlens)
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1432, in __call__
[rank4]:     return self._torchdynamo_orig_callable(
[rank4]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1213, in __call__
[rank4]:     result = self._inner_convert(
[rank4]:              ^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 598, in __call__
[rank4]:     return _compile(
[rank4]:            ^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1110, in _compile
[rank4]:     raise InternalTorchDynamoError(
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1059, in _compile
[rank4]:     guarded_code = compile_inner(code, one_graph, hooks, transform)
[rank4]:                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_utils_internal.py", line 97, in wrapper_function
[rank4]:     return function(*args, **kwargs)
[rank4]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 761, in compile_inner
[rank4]:     return _compile_inner(code, one_graph, hooks, transform)
[rank4]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 797, in _compile_inner
[rank4]:     out_code = transform_code_object(code, transform)
[rank4]:                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/bytecode_transformation.py", line 1422, in transform_code_object
[rank4]:     transformations(instructions, code_options)
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 257, in _fn
[rank4]:     return fn(*args, **kwargs)
[rank4]:            ^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 715, in transform
[rank4]:     tracer.run()
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 3498, in run
[rank4]:     super().run()
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 1337, in run
[rank4]:     while self.step():
[rank4]:           ^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 1246, in step
[rank4]:     self.dispatch_table[inst.opcode](self, inst)
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 2157, in COMPARE_OP
[rank4]:     self.push(compare_op_handlers[inst.argval](self, self.popn(2), {}))
[rank4]:     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 1111, in call_function
[rank4]:     return handler(tx, args, kwargs)
[rank4]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 789, in <lambda>
[rank4]:     return lambda tx, args, kwargs: obj.call_function(
[rank4]:                                     ^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 1111, in call_function
[rank4]:     return handler(tx, args, kwargs)
[rank4]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 945, in builtin_dispatch
[rank4]:     rv = fn(tx, args, kwargs)
[rank4]:          ^^^^^^^^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 839, in call_binop_handlers
[rank4]:     rv = fn(tx, *args)
[rank4]:          ^^^^^^^^^^^^^
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 533, in compare_by_value
[rank4]:     return ConstantVariable(op(a.value, b.value))
[rank4]:            ^^^^^^^^^^^^^^^^^^^^
[rank4]: torch._dynamo.exc.InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'

[rank4]: from user code:
[rank4]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857
[rank4]:     if window_size > 0:

[rank4]: Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/4/stdout.log
ADDED
File without changes

bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/5/error.json
ADDED
@@ -0,0 +1 @@
{"message": {"message": "InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'\n\nfrom user code:\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857\n if window_size > 0:\n\nSet TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS=\"+dynamo\"\n", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 488, in main\n output = model(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py\", line 172, in wrapped_func\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 424, in forward\n outputs = self.model(\n ^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 294, in forward\n hidden_states, attentions, past_key_values = layer(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 655, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 108, in forward\n hidden_states = self.attn_norm(hidden_states)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 109, in torch_dynamo_resume_in_forward_at_108\n hidden_states, attentions, past_key_values = self.attn(\n ^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py\", line 108, in forward\n q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py\", line 123, in torch_dynamo_resume_in_forward_at_108\n o = parallel_nsa(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 838, in parallel_nsa\n o_cmp, lse_cmp = parallel_nsa_compression(\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 857, in torch_dynamo_resume_in_parallel_nsa_at_838\n o = o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, 
block_size, scale, cu_seqlens)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1432, in __call__\n return self._torchdynamo_orig_callable(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1213, in __call__\n result = self._inner_convert(\n ^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 598, in __call__\n return _compile(\n ^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1110, in _compile\n raise InternalTorchDynamoError(\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1059, in _compile\n guarded_code = compile_inner(code, one_graph, hooks, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_utils_internal.py\", line 97, in wrapper_function\n return function(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 761, in compile_inner\n return _compile_inner(code, one_graph, hooks, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 797, in _compile_inner\n out_code = transform_code_object(code, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/bytecode_transformation.py\", line 1422, in transform_code_object\n transformations(instructions, code_options)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 257, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 715, in transform\n tracer.run()\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 3498, in run\n super().run()\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 1337, in run\n while self.step():\n ^^^^^^^^^^^\n File 
\"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 1246, in step\n self.dispatch_table[inst.opcode](self, inst)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 2157, in COMPARE_OP\n self.push(compare_op_handlers[inst.argval](self, self.popn(2), {}))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 1111, in call_function\n return handler(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 789, in <lambda>\n return lambda tx, args, kwargs: obj.call_function(\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 1111, in call_function\n return handler(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 945, in builtin_dispatch\n rv = fn(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 839, in call_binop_handlers\n rv = fn(tx, *args)\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 533, in compare_by_value\n return ConstantVariable(op(a.value, b.value))\n ^^^^^^^^^^^^^^^^^^^^\ntorch._dynamo.exc.InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'\n\nfrom user code:\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857\n if window_size > 0:\n\nSet TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS=\"+dynamo\"\n\n", "timestamp": "1753352475"}}}
|
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/5/stderr.log
ADDED
|
@@ -0,0 +1,571 @@
|
| 1 |
+
[titan] 2025-07-24 18:19:06,977 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-24 18:19:06,977 - root - INFO - [32m{
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "bfloat16",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_nsa_1_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 8,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 2,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}[39m
|
| 132 |
+
[titan] 2025-07-24 18:19:06,978 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-24 18:19:07,669 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-24 18:19:07,672 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-24 18:19:07,731 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-24 18:19:07,731 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-24 18:19:07,731 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-24 18:19:07,745 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-24 18:19:07,847 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-24 18:19:07,847 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default,default,default
|
| 146 |
+
[titan] 2025-07-24 18:19:07,962 - root - INFO - Subset [36m/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550)[39m:
|
| 147 |
+
IterableDataset({
|
| 148 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 149 |
+
num_shards: 140
|
| 150 |
+
})
|
| 151 |
+
[titan] 2025-07-24 18:19:07,962 - root - INFO - Shuffling the dataset with seed 42
|
| 152 |
+
[titan] 2025-07-24 18:19:07,962 - root - WARNING - [31mDataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.[39m
|
| 153 |
+
[titan] 2025-07-24 18:20:11,070 - root - INFO - Subset [36m/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged:default (p = 0.300)[39m:
|
| 154 |
+
IterableDataset({
|
| 155 |
+
features: ['repo', 'content'],
|
| 156 |
+
num_shards: 1
|
| 157 |
+
})
|
| 158 |
+
[titan] 2025-07-24 18:20:11,070 - root - INFO - Shuffling the dataset with seed 42
|
| 159 |
+
[titan] 2025-07-24 18:20:11,070 - root - WARNING - [31mDataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged has insufficient shards (1). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.[39m
|
| 160 |
+
[titan] 2025-07-24 18:20:11,445 - root - INFO - Subset [36m/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default (p = 0.150)[39m:
|
| 161 |
+
IterableDataset({
|
| 162 |
+
features: ['text', 'cc-path', 'domain', 'lang', 'lang_score', 'timestamp', 'url', 'math_score'],
|
| 163 |
+
num_shards: 100
|
| 164 |
+
})
|
| 165 |
+
[titan] 2025-07-24 18:20:11,445 - root - INFO - Shuffling the dataset with seed 42
|
| 166 |
+
[titan] 2025-07-24 18:20:11,445 - root - WARNING - [31mDataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro has insufficient shards (100). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.[39m
|
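The 256-shard floor in the three warnings above is the product of the parallel dimensions; a sketch of the check in plain arithmetic (not flame's actual code):

dp_ranks = 8          # device mesh ['dp_shard'], [8]
num_workers = 32      # training.num_workers
min_shards = dp_ranks * num_workers  # 256
for name, num_shards in [("fineweb-edu-sample", 140),
                         ("small_repos_20B_sample_merged", 1),
                         ("megamath-web-pro", 100)]:
    if num_shards < min_shards:
        print(f"reshard {name} to {min_shards} shards, disable streaming")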
| 167 |
+
[titan] 2025-07-24 18:20:18,139 - root - INFO - Interleaving 3 datasets with probabilities [0.55, 0.3, 0.15]
|
| 168 |
+
[titan] 2025-07-24 18:20:18,885 - root - INFO - IterableDataset({
|
| 169 |
+
features: ['text', 'content'],
|
| 170 |
+
num_shards: 256
|
| 171 |
+
})
|
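The interleaving step reported above maps onto the Hugging Face datasets API; a minimal sketch under the assumption that flame wraps something like the following (paths trimmed to basenames, streaming shown as requested even though the warnings above say it ends up disabled after resharding):

from datasets import load_dataset, interleave_datasets

parts = [
    load_dataset(path, split="train", streaming=True)
    for path in ("fineweb-edu-sample",             # p = 0.55
                 "small_repos_20B_sample_merged",  # p = 0.30
                 "megamath-web-pro")               # p = 0.15
]
mixed = interleave_datasets(parts, probabilities=[0.55, 0.30, 0.15], seed=42)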
| 172 |
+
[titan] 2025-07-24 18:20:19,030 - root - INFO - Building dataloader...
|
| 173 |
+
[titan] 2025-07-24 18:20:19,033 - root - INFO - Loading model config from /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_nsa_1_340M.json
|
| 174 |
+
[titan] 2025-07-24 18:20:19,035 - root - INFO - Building model from the config
|
| 175 |
+
[32mGatedDeltaNetConfig {
|
| 176 |
+
"allow_neg_eigval": false,
|
| 177 |
+
"architectures": [
|
| 178 |
+
"GatedDeltaNetForCausalLM"
|
| 179 |
+
],
|
| 180 |
+
"attn": {
|
| 181 |
+
"block_counts": 16,
|
| 182 |
+
"block_size": 64,
|
| 183 |
+
"layers": [
|
| 184 |
+
5,
|
| 185 |
+
11,
|
| 186 |
+
17,
|
| 187 |
+
23
|
| 188 |
+
],
|
| 189 |
+
"num_heads": 32,
|
| 190 |
+
"num_kv_heads": 2,
|
| 191 |
+
"qkv_bias": false,
|
| 192 |
+
"rope_theta": 160000.0,
|
| 193 |
+
"type": "nsa",
|
| 194 |
+
"window_size": null
|
| 195 |
+
},
|
| 196 |
+
"attn_mode": "chunk",
|
| 197 |
+
"bos_token_id": 1,
|
| 198 |
+
"conv_size": 4,
|
| 199 |
+
"eos_token_id": 2,
|
| 200 |
+
"expand_k": 1,
|
| 201 |
+
"expand_v": 1,
|
| 202 |
+
"fuse_cross_entropy": true,
|
| 203 |
+
"fuse_norm": true,
|
| 204 |
+
"fuse_swiglu": true,
|
| 205 |
+
"head_dim": 256,
|
| 206 |
+
"hidden_act": "swish",
|
| 207 |
+
"hidden_ratio": 4,
|
| 208 |
+
"hidden_size": 1024,
|
| 209 |
+
"initializer_range": 0.02,
|
| 210 |
+
"intermediate_size": null,
|
| 211 |
+
"max_position_embeddings": 8192,
|
| 212 |
+
"model_type": "gated_deltanet",
|
| 213 |
+
"norm_eps": 1e-06,
|
| 214 |
+
"norm_first": false,
|
| 215 |
+
"num_heads": 4,
|
| 216 |
+
"num_hidden_layers": 24,
|
| 217 |
+
"num_v_heads": null,
|
| 218 |
+
"qk_activation": "silu",
|
| 219 |
+
"qk_norm": "l2",
|
| 220 |
+
"tie_word_embeddings": false,
|
| 221 |
+
"torch_dtype": "bfloat16",
|
| 222 |
+
"transformers_version": "4.53.3",
|
| 223 |
+
"use_beta": true,
|
| 224 |
+
"use_cache": true,
|
| 225 |
+
"use_gate": true,
|
| 226 |
+
"use_l2warp": false,
|
| 227 |
+
"use_output_norm": true,
|
| 228 |
+
"use_short_conv": true,
|
| 229 |
+
"vocab_size": 32000
|
| 230 |
+
}
|
| 231 |
+
[39m
|
| 232 |
+
[titan] 2025-07-24 18:20:19,380 - root - INFO - [34m
|
| 233 |
+
GatedDeltaNetForCausalLM(
|
| 234 |
+
(model): GatedDeltaNetModel(
|
| 235 |
+
(embeddings): Embedding(32000, 1024)
|
| 236 |
+
(layers): ModuleList(
|
| 237 |
+
(0-4): 5 x GatedDeltaNetBlock(
|
| 238 |
+
(attn_norm): RMSNorm(1024, eps=1e-06)
|
| 239 |
+
(attn): GatedDeltaNet(
|
| 240 |
+
(q_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 241 |
+
(k_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 242 |
+
(v_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 243 |
+
(a_proj): Linear(in_features=1024, out_features=4, bias=False)
|
| 244 |
+
(b_proj): Linear(in_features=1024, out_features=4, bias=False)
|
| 245 |
+
(q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 246 |
+
(k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 247 |
+
(v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 248 |
+
(g_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 249 |
+
(o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
|
| 250 |
+
(o_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 251 |
+
)
|
| 252 |
+
(mlp_norm): RMSNorm(1024, eps=1e-06)
|
| 253 |
+
(mlp): GatedMLP(
|
| 254 |
+
(gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 255 |
+
(up_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 256 |
+
(down_proj): Linear(in_features=2816, out_features=1024, bias=False)
|
| 257 |
+
(swiglu_linear): SwiGLULinear()
|
| 258 |
+
)
|
| 259 |
+
)
|
| 260 |
+
(5): GatedDeltaNetBlock(
|
| 261 |
+
(attn_norm): RMSNorm(1024, eps=1e-06)
|
| 262 |
+
(attn): NativeSparseAttention(
|
| 263 |
+
(q_proj): Linear(in_features=1024, out_features=2048, bias=False)
|
| 264 |
+
(k_proj): Linear(in_features=1024, out_features=128, bias=False)
|
| 265 |
+
(v_proj): Linear(in_features=1024, out_features=128, bias=False)
|
| 266 |
+
(g_proj): Linear(in_features=1024, out_features=96, bias=False)
|
| 267 |
+
(o_proj): Linear(in_features=2048, out_features=1024, bias=False)
|
| 268 |
+
(rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
|
| 269 |
+
)
|
| 270 |
+
(mlp_norm): RMSNorm(1024, eps=1e-06)
|
| 271 |
+
(mlp): GatedMLP(
|
| 272 |
+
(gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 273 |
+
(up_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 274 |
+
(down_proj): Linear(in_features=2816, out_features=1024, bias=False)
|
| 275 |
+
(swiglu_linear): SwiGLULinear()
|
| 276 |
+
)
|
| 277 |
+
)
|
| 278 |
+
(6-10): 5 x GatedDeltaNetBlock(
|
| 279 |
+
(attn_norm): RMSNorm(1024, eps=1e-06)
|
| 280 |
+
(attn): GatedDeltaNet(
|
| 281 |
+
(q_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 282 |
+
(k_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 283 |
+
(v_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 284 |
+
(a_proj): Linear(in_features=1024, out_features=4, bias=False)
|
| 285 |
+
(b_proj): Linear(in_features=1024, out_features=4, bias=False)
|
| 286 |
+
(q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 287 |
+
(k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 288 |
+
(v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 289 |
+
(g_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 290 |
+
(o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
|
| 291 |
+
(o_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 292 |
+
)
|
| 293 |
+
(mlp_norm): RMSNorm(1024, eps=1e-06)
|
| 294 |
+
(mlp): GatedMLP(
|
| 295 |
+
(gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 296 |
+
(up_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 297 |
+
(down_proj): Linear(in_features=2816, out_features=1024, bias=False)
|
| 298 |
+
(swiglu_linear): SwiGLULinear()
|
| 299 |
+
)
|
| 300 |
+
)
|
| 301 |
+
(11): GatedDeltaNetBlock(
|
| 302 |
+
(attn_norm): RMSNorm(1024, eps=1e-06)
|
| 303 |
+
(attn): NativeSparseAttention(
|
| 304 |
+
(q_proj): Linear(in_features=1024, out_features=2048, bias=False)
|
| 305 |
+
(k_proj): Linear(in_features=1024, out_features=128, bias=False)
|
| 306 |
+
(v_proj): Linear(in_features=1024, out_features=128, bias=False)
|
| 307 |
+
(g_proj): Linear(in_features=1024, out_features=96, bias=False)
|
| 308 |
+
(o_proj): Linear(in_features=2048, out_features=1024, bias=False)
|
| 309 |
+
(rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
|
| 310 |
+
)
|
| 311 |
+
(mlp_norm): RMSNorm(1024, eps=1e-06)
|
| 312 |
+
(mlp): GatedMLP(
|
| 313 |
+
(gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 314 |
+
(up_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 315 |
+
(down_proj): Linear(in_features=2816, out_features=1024, bias=False)
|
| 316 |
+
(swiglu_linear): SwiGLULinear()
|
| 317 |
+
)
|
| 318 |
+
)
|
| 319 |
+
(12-16): 5 x GatedDeltaNetBlock(
|
| 320 |
+
(attn_norm): RMSNorm(1024, eps=1e-06)
|
| 321 |
+
(attn): GatedDeltaNet(
|
| 322 |
+
(q_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 323 |
+
(k_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 324 |
+
(v_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 325 |
+
(a_proj): Linear(in_features=1024, out_features=4, bias=False)
|
| 326 |
+
(b_proj): Linear(in_features=1024, out_features=4, bias=False)
|
| 327 |
+
(q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 328 |
+
(k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 329 |
+
(v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 330 |
+
(g_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 331 |
+
(o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
|
| 332 |
+
(o_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 333 |
+
)
|
| 334 |
+
(mlp_norm): RMSNorm(1024, eps=1e-06)
|
| 335 |
+
(mlp): GatedMLP(
|
| 336 |
+
(gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 337 |
+
(up_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 338 |
+
(down_proj): Linear(in_features=2816, out_features=1024, bias=False)
|
| 339 |
+
(swiglu_linear): SwiGLULinear()
|
| 340 |
+
)
|
| 341 |
+
)
|
| 342 |
+
(17): GatedDeltaNetBlock(
|
| 343 |
+
(attn_norm): RMSNorm(1024, eps=1e-06)
|
| 344 |
+
(attn): NativeSparseAttention(
|
| 345 |
+
(q_proj): Linear(in_features=1024, out_features=2048, bias=False)
|
| 346 |
+
(k_proj): Linear(in_features=1024, out_features=128, bias=False)
|
| 347 |
+
(v_proj): Linear(in_features=1024, out_features=128, bias=False)
|
| 348 |
+
(g_proj): Linear(in_features=1024, out_features=96, bias=False)
|
| 349 |
+
(o_proj): Linear(in_features=2048, out_features=1024, bias=False)
|
| 350 |
+
(rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
|
| 351 |
+
)
|
| 352 |
+
(mlp_norm): RMSNorm(1024, eps=1e-06)
|
| 353 |
+
(mlp): GatedMLP(
|
| 354 |
+
(gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 355 |
+
(up_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 356 |
+
(down_proj): Linear(in_features=2816, out_features=1024, bias=False)
|
| 357 |
+
(swiglu_linear): SwiGLULinear()
|
| 358 |
+
)
|
| 359 |
+
)
|
| 360 |
+
(18-22): 5 x GatedDeltaNetBlock(
|
| 361 |
+
(attn_norm): RMSNorm(1024, eps=1e-06)
|
| 362 |
+
(attn): GatedDeltaNet(
|
| 363 |
+
(q_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 364 |
+
(k_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 365 |
+
(v_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 366 |
+
(a_proj): Linear(in_features=1024, out_features=4, bias=False)
|
| 367 |
+
(b_proj): Linear(in_features=1024, out_features=4, bias=False)
|
| 368 |
+
(q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 369 |
+
(k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 370 |
+
(v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 371 |
+
(g_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 372 |
+
(o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
|
| 373 |
+
(o_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 374 |
+
)
|
| 375 |
+
(mlp_norm): RMSNorm(1024, eps=1e-06)
|
| 376 |
+
(mlp): GatedMLP(
|
| 377 |
+
(gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 378 |
+
(up_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 379 |
+
(down_proj): Linear(in_features=2816, out_features=1024, bias=False)
|
| 380 |
+
(swiglu_linear): SwiGLULinear()
|
| 381 |
+
)
|
| 382 |
+
)
|
| 383 |
+
(23): GatedDeltaNetBlock(
|
| 384 |
+
(attn_norm): RMSNorm(1024, eps=1e-06)
|
| 385 |
+
(attn): NativeSparseAttention(
|
| 386 |
+
(q_proj): Linear(in_features=1024, out_features=2048, bias=False)
|
| 387 |
+
(k_proj): Linear(in_features=1024, out_features=128, bias=False)
|
| 388 |
+
(v_proj): Linear(in_features=1024, out_features=128, bias=False)
|
| 389 |
+
(g_proj): Linear(in_features=1024, out_features=96, bias=False)
|
| 390 |
+
(o_proj): Linear(in_features=2048, out_features=1024, bias=False)
|
| 391 |
+
(rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
|
| 392 |
+
)
|
| 393 |
+
(mlp_norm): RMSNorm(1024, eps=1e-06)
|
| 394 |
+
(mlp): GatedMLP(
|
| 395 |
+
(gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 396 |
+
(up_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 397 |
+
(down_proj): Linear(in_features=2816, out_features=1024, bias=False)
|
| 398 |
+
(swiglu_linear): SwiGLULinear()
|
| 399 |
+
)
|
| 400 |
+
)
|
| 401 |
+
)
|
| 402 |
+
(norm): RMSNorm(1024, eps=1e-06)
|
| 403 |
+
)
|
| 404 |
+
(lm_head): Linear(in_features=1024, out_features=32000, bias=False)
|
| 405 |
+
(criterion): FusedLinearCrossEntropyLoss()
|
| 406 |
+
)[39m
|
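The NativeSparseAttention projection shapes in the module dump follow from the attn config printed earlier; a quick reconciliation (head_dim 64 is read off the rotary dim; the factor 3 in g_proj is assumed to be NSA's three branches, compression/selection/sliding window, which this log does not state):

hidden_size  = 1024
num_heads    = 32   # attn.num_heads
num_kv_heads = 2    # attn.num_kv_heads
head_dim     = 64   # RotaryEmbedding(dim=64, ...) in the dump
q_out  = num_heads * head_dim      # 2048 -> q_proj out_features
kv_out = num_kv_heads * head_dim   # 128  -> k_proj / v_proj out_features
g_out  = num_heads * 3             # 96   -> g_proj out_features (assumed branch gates)
o_in   = num_heads * head_dim      # 2048 -> o_proj in_features
assert (q_out, kv_out, g_out, o_in) == (2048, 128, 96, 2048)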
| 407 |
+
|
| 408 |
+
[titan] 2025-07-24 18:20:19,418 - root - INFO - Compiling each block with torch.compile
|
| 409 |
+
[titan] 2025-07-24 18:20:19,418 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
|
| 410 |
+
[titan] 2025-07-24 18:20:19,419 - root - INFO - Compiling the entire model with torch.compile
|
| 411 |
+
[titan] 2025-07-24 18:20:19,503 - root - INFO - Applied FSDP to the model
|
| 412 |
+
[titan] 2025-07-24 18:20:19,588 - fla.models.gated_deltanet.modeling_gated_deltanet - WARNING - `A_log` is a DTensor, skipping initialization
|
| 413 |
+
[titan] 2025-07-24 18:20:19,588 - fla.models.gated_deltanet.modeling_gated_deltanet - WARNING - `dt_bias` is a DTensor, skipping initialization
|
| 414 |
+
[titan] 2025-07-24 18:20:19,684 - root - INFO - CUDA memory usage for model: 0.10GiB(0.10%)
|
| 415 |
+
[titan] 2025-07-24 18:20:19,686 - root - WARNING - Warmup (100) + decay (95366) steps exceed total training steps (95366). Adjusting decay steps to 95266.
|
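The clamp in the warning above works out with this job's numbers, assuming decay steps are requested as steps * decay_ratio (which matches the logged 95366); a sketch, not flame's scheduler code:

steps  = 95366              # training.steps
warmup = 100                # lr_scheduler.warmup_steps
decay  = int(steps * 1.0)   # steps * lr_scheduler.decay_ratio = 95366
if warmup + decay > steps:  # 95466 > 95366
    decay = steps - warmup  # 95266, as logged
assert warmup + decay == steps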
| 416 |
+
[titan] 2025-07-24 18:20:19,829 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/checkpoint
|
| 417 |
+
[titan] 2025-07-24 18:20:19,830 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 418 |
+
[titan] 2025-07-24 18:20:19,888 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 419 |
+
[titan] 2025-07-24 18:20:28,483 - root - INFO - [31m***** Running training *****[39m
|
| 420 |
+
[titan] 2025-07-24 18:20:28,485 - root - INFO - [32m Training starts at step 1
|
| 421 |
+
[titan] 2025-07-24 18:20:28,486 - root - INFO - [32m Number of tokens per sequence = 8,192
|
| 422 |
+
[titan] 2025-07-24 18:20:28,486 - root - INFO - [32m Gradient Accumulation steps = 2
|
| 423 |
+
[titan] 2025-07-24 18:20:28,487 - root - INFO - [32m Instantaneous batch size (per device) = 8
|
| 424 |
+
[titan] 2025-07-24 18:20:28,487 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 128 (1,048,576 tokens)
|
| 425 |
+
[titan] 2025-07-24 18:20:28,487 - root - INFO - [32m Total optimization steps = 95,366 (99,998,498,816 tokens)
|
| 426 |
+
[titan] 2025-07-24 18:20:28,487 - root - INFO - [32m Warmup steps = 100 (104,857,600 tokens)
|
| 427 |
+
[titan] 2025-07-24 18:20:28,487 - root - INFO - [32m Number of parameters = 396,695,712 [39m
|
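The banner's token accounting reconciles exactly; plain arithmetic, with the 8 ranks taken from the 1-D device mesh ['dp_shard'], [8] earlier in the log:

per_device_bs = 8      # instantaneous batch size (per device)
dp_ranks      = 8      # dp_shard mesh size
grad_accum    = 2      # gradient accumulation steps
seq_len       = 8192   # tokens per sequence
global_bs     = per_device_bs * dp_ranks * grad_accum  # 128 sequences
tokens_step   = global_bs * seq_len                    # 1,048,576 tokens/step
total_tokens  = tokens_step * 95366                    # 99,998,498,816
warmup_tokens = tokens_step * 100                      # 104,857,600
assert (global_bs, tokens_step) == (128, 1_048_576)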
| 428 |
+
[titan] 2025-07-24 18:20:28,487 - root - INFO - Profiling active. Traces will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/profile_trace
|
| 429 |
+
/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py:1263: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
|
| 430 |
+
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
|
| 431 |
+
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
|
| 432 |
+
torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
|
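For the tracing warning above, the escape hatch the hint names looks like this in general; a generic sketch of torch.compiler.allow_in_graph, not a verified fix for the Triton-internal cuda_utils.get_device_properties builtin:

import torch

def calls_untraceable_builtin(x):
    # stand-in for a wrapper around the C-extension call Dynamo cannot
    # trace; allow_in_graph makes Dynamo treat it as an opaque graph node
    return x * 2

torch.compiler.allow_in_graph(calls_untraceable_builtin)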
| 433 |
+
[rank5]: Traceback (most recent call last):
|
| 434 |
+
[rank5]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 435 |
+
[rank5]: File "<frozen runpy>", line 88, in _run_code
|
| 436 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 616, in <module>
|
| 437 |
+
[rank5]: main(config)
|
| 438 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 439 |
+
[rank5]: return f(*args, **kwargs)
|
| 440 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^
|
| 441 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 488, in main
|
| 442 |
+
[rank5]: output = model(
|
| 443 |
+
[rank5]: ^^^^^^
|
| 444 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
|
| 445 |
+
[rank5]: return self._call_impl(*args, **kwargs)
|
| 446 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 447 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
|
| 448 |
+
[rank5]: return inner()
|
| 449 |
+
[rank5]: ^^^^^^^
|
| 450 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
|
| 451 |
+
[rank5]: result = forward_call(*args, **kwargs)
|
| 452 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 453 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
|
| 454 |
+
[rank5]: return func(*args, **kwargs)
|
| 455 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^
|
| 456 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 424, in forward
|
| 457 |
+
[rank5]: outputs = self.model(
|
| 458 |
+
[rank5]: ^^^^^^^^^^^
|
| 459 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
|
| 460 |
+
[rank5]: return self._call_impl(*args, **kwargs)
|
| 461 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 462 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
|
| 463 |
+
[rank5]: return forward_call(*args, **kwargs)
|
| 464 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 465 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 294, in forward
|
| 466 |
+
[rank5]: hidden_states, attentions, past_key_values = layer(
|
| 467 |
+
[rank5]: ^^^^^^
|
| 468 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
|
| 469 |
+
[rank5]: return self._call_impl(*args, **kwargs)
|
| 470 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 471 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
|
| 472 |
+
[rank5]: return inner()
|
| 473 |
+
[rank5]: ^^^^^^^
|
| 474 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
|
| 475 |
+
[rank5]: result = forward_call(*args, **kwargs)
|
| 476 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 477 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
|
| 478 |
+
[rank5]: return fn(*args, **kwargs)
|
| 479 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^
|
| 480 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
|
| 481 |
+
[rank5]: return self._call_impl(*args, **kwargs)
|
| 482 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 483 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
|
| 484 |
+
[rank5]: return forward_call(*args, **kwargs)
|
| 485 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 486 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 108, in forward
|
| 487 |
+
[rank5]: hidden_states = self.attn_norm(hidden_states)
|
| 488 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 109, in torch_dynamo_resume_in_forward_at_108
|
| 489 |
+
[rank5]: hidden_states, attentions, past_key_values = self.attn(
|
| 490 |
+
[rank5]: ^^^^^^^^^^
|
| 491 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
|
| 492 |
+
[rank5]: return self._call_impl(*args, **kwargs)
|
| 493 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 494 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
|
| 495 |
+
[rank5]: return forward_call(*args, **kwargs)
|
| 496 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 497 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py", line 108, in forward
|
| 498 |
+
[rank5]: q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)
|
| 499 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py", line 123, in torch_dynamo_resume_in_forward_at_108
|
| 500 |
+
[rank5]: o = parallel_nsa(
|
| 501 |
+
[rank5]: ^^^^^^^^^^^^^
|
| 502 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 838, in parallel_nsa
|
| 503 |
+
[rank5]: o_cmp, lse_cmp = parallel_nsa_compression(
|
| 504 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 857, in torch_dynamo_resume_in_parallel_nsa_at_838
|
| 505 |
+
[rank5]: o = o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, scale, cu_seqlens)
|
| 506 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1432, in __call__
|
| 507 |
+
[rank5]: return self._torchdynamo_orig_callable(
|
| 508 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 509 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1213, in __call__
|
| 510 |
+
[rank5]: result = self._inner_convert(
|
| 511 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^
|
| 512 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 598, in __call__
|
| 513 |
+
[rank5]: return _compile(
|
| 514 |
+
[rank5]: ^^^^^^^^^
|
| 515 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1110, in _compile
|
| 516 |
+
[rank5]: raise InternalTorchDynamoError(
|
| 517 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1059, in _compile
|
| 518 |
+
[rank5]: guarded_code = compile_inner(code, one_graph, hooks, transform)
|
| 519 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 520 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_utils_internal.py", line 97, in wrapper_function
|
| 521 |
+
[rank5]: return function(*args, **kwargs)
|
| 522 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 523 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 761, in compile_inner
|
| 524 |
+
[rank5]: return _compile_inner(code, one_graph, hooks, transform)
|
| 525 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 526 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 797, in _compile_inner
|
| 527 |
+
[rank5]: out_code = transform_code_object(code, transform)
|
| 528 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 529 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/bytecode_transformation.py", line 1422, in transform_code_object
|
| 530 |
+
[rank5]: transformations(instructions, code_options)
|
| 531 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 257, in _fn
|
| 532 |
+
[rank5]: return fn(*args, **kwargs)
|
| 533 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^
|
| 534 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 715, in transform
|
| 535 |
+
[rank5]: tracer.run()
|
| 536 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 3498, in run
|
| 537 |
+
[rank5]: super().run()
|
| 538 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 1337, in run
|
| 539 |
+
[rank5]: while self.step():
|
| 540 |
+
[rank5]: ^^^^^^^^^^^
|
| 541 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 1246, in step
|
| 542 |
+
[rank5]: self.dispatch_table[inst.opcode](self, inst)
|
| 543 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 2157, in COMPARE_OP
|
| 544 |
+
[rank5]: self.push(compare_op_handlers[inst.argval](self, self.popn(2), {}))
|
| 545 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 546 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 1111, in call_function
|
| 547 |
+
[rank5]: return handler(tx, args, kwargs)
|
| 548 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 549 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 789, in <lambda>
|
| 550 |
+
[rank5]: return lambda tx, args, kwargs: obj.call_function(
|
| 551 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^
|
| 552 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 1111, in call_function
|
| 553 |
+
[rank5]: return handler(tx, args, kwargs)
|
| 554 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 555 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 945, in builtin_dispatch
|
| 556 |
+
[rank5]: rv = fn(tx, args, kwargs)
|
| 557 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^
|
| 558 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 839, in call_binop_handlers
|
| 559 |
+
[rank5]: rv = fn(tx, *args)
|
| 560 |
+
[rank5]: ^^^^^^^^^^^^^
|
| 561 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 533, in compare_by_value
|
| 562 |
+
[rank5]: return ConstantVariable(op(a.value, b.value))
|
| 563 |
+
[rank5]: ^^^^^^^^^^^^^^^^^^^^
|
| 564 |
+
[rank5]: torch._dynamo.exc.InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'
|
| 565 |
+
|
| 566 |
+
[rank5]: from user code:
|
| 567 |
+
[rank5]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857
|
| 568 |
+
[rank5]: if window_size > 0:
|
| 569 |
+
|
| 570 |
+
[rank5]: Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
|
| 571 |
+
|
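Since every attempt dies at this same frame during compilation, two config-level workarounds are consistent with the settings dumped above (hypothetical edits, not tested here), shown as the values they deserialize to:

# Option 1: bypass Dynamo entirely (flame job config, training section)
training = {"compile": False}

# Option 2: keep compile but make the sliding-window check well-typed
# (attn section of configs/gdn_6_nsa_1_340M.json); 0 presumably disables
# the branch just as null intends, and `0 > 0` traces cleanly
attn = {"type": "nsa", "block_size": 64, "block_counts": 16, "window_size": 0}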
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/5/stdout.log
ADDED
|
File without changes
|
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/6/error.json
ADDED
|
@@ -0,0 +1 @@
|
| 1 |
+
{"message": {"message": "InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'\n\nfrom user code:\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857\n if window_size > 0:\n\nSet TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS=\"+dynamo\"\n", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 488, in main\n output = model(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py\", line 172, in wrapped_func\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 424, in forward\n outputs = self.model(\n ^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 294, in forward\n hidden_states, attentions, past_key_values = layer(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 655, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 108, in forward\n hidden_states = self.attn_norm(hidden_states)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 109, in torch_dynamo_resume_in_forward_at_108\n hidden_states, attentions, past_key_values = self.attn(\n ^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py\", line 108, in forward\n q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py\", line 123, in torch_dynamo_resume_in_forward_at_108\n o = parallel_nsa(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 838, in parallel_nsa\n o_cmp, lse_cmp = parallel_nsa_compression(\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 857, in torch_dynamo_resume_in_parallel_nsa_at_838\n o = o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, 
block_size, scale, cu_seqlens)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1432, in __call__\n return self._torchdynamo_orig_callable(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1213, in __call__\n result = self._inner_convert(\n ^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 598, in __call__\n return _compile(\n ^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1110, in _compile\n raise InternalTorchDynamoError(\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1059, in _compile\n guarded_code = compile_inner(code, one_graph, hooks, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_utils_internal.py\", line 97, in wrapper_function\n return function(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 761, in compile_inner\n return _compile_inner(code, one_graph, hooks, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 797, in _compile_inner\n out_code = transform_code_object(code, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/bytecode_transformation.py\", line 1422, in transform_code_object\n transformations(instructions, code_options)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 257, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 715, in transform\n tracer.run()\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 3498, in run\n super().run()\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 1337, in run\n while self.step():\n ^^^^^^^^^^^\n File 
\"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 1246, in step\n self.dispatch_table[inst.opcode](self, inst)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 2157, in COMPARE_OP\n self.push(compare_op_handlers[inst.argval](self, self.popn(2), {}))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 1111, in call_function\n return handler(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 789, in <lambda>\n return lambda tx, args, kwargs: obj.call_function(\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 1111, in call_function\n return handler(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 945, in builtin_dispatch\n rv = fn(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 839, in call_binop_handlers\n rv = fn(tx, *args)\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 533, in compare_by_value\n return ConstantVariable(op(a.value, b.value))\n ^^^^^^^^^^^^^^^^^^^^\ntorch._dynamo.exc.InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'\n\nfrom user code:\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857\n if window_size > 0:\n\nSet TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS=\"+dynamo\"\n\n", "timestamp": "1753352474"}}}
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/6/stderr.log
ADDED
@@ -0,0 +1,571 @@
+[titan] 2025-07-24 18:19:07,121 - root - INFO - Starting job: default job
+[titan] 2025-07-24 18:19:07,121 - root - INFO - [32m{
+  "activation_checkpoint": {
+    "mode": "none",
+    "selective_ac_option": "2"
+  },
+  "activation_offload": {
+    "mode": "none"
+  },
+  "checkpoint": {
+    "async_mode": "disabled",
+    "create_seed_checkpoint": false,
+    "enable_checkpoint": true,
+    "exclude_from_loading": [],
+    "export_dtype": "bfloat16",
+    "folder": "checkpoint",
+    "interval": 8192,
+    "interval_type": "steps",
+    "keep_latest_k": 100,
+    "load_step": -1,
+    "model_weights_only": false
+  },
+  "comm": {
+    "init_timeout_seconds": 300,
+    "trace_buf_size": 20000,
+    "train_timeout_seconds": 100
+  },
+  "experimental": {
+    "context_parallel_degree": 1,
+    "context_parallel_rotate_method": "allgather",
+    "custom_model_path": "",
+    "enable_async_tensor_parallel": false,
+    "enable_compiled_autograd": false,
+    "pipeline_parallel_degree": 1,
+    "pipeline_parallel_microbatches": null,
+    "pipeline_parallel_schedule": "1F1B",
+    "pipeline_parallel_schedule_csv": "",
+    "pipeline_parallel_split_points": []
+  },
+  "fault_tolerance": {
+    "enable": false,
+    "group_size": 0,
+    "min_replica_size": 1,
+    "replica_id": 0
+  },
+  "float8": {
+    "enable_fsdp_float8_all_gather": false,
+    "force_recompute_fp8_weight_in_bwd": false,
+    "precompute_float8_dynamic_scale_for_fsdp": false,
+    "recipe_name": null
+  },
+  "job": {
+    "config_file": "flame/models/fla.toml",
+    "description": "default job",
+    "dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2",
+    "print_args": true,
+    "use_for_integration_test": false
+  },
+  "lr_scheduler": {
+    "decay_ratio": 1.0,
+    "decay_type": "linear",
+    "lr_min": 0.01,
+    "warmup_steps": 100
+  },
+  "memory_estimation": {
+    "disable_fake_mode": false,
+    "enabled": false
+  },
+  "metrics": {
+    "disable_color_printing": false,
+    "enable_tensorboard": true,
+    "enable_wandb": true,
+    "log_freq": 1,
+    "save_for_all_ranks": false,
+    "save_tb_folder": "tb"
+  },
+  "model": {
+    "config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_nsa_1_340M.json",
+    "converters": [],
+    "name": "fla",
+    "print_after_conversion": false,
+    "tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
+  },
+  "optimizer": {
+    "early_step_in_backward": false,
+    "eps": 1e-08,
+    "implementation": "fused",
+    "lr": 0.0003,
+    "name": "AdamW"
+  },
+  "profiling": {
+    "enable_memory_snapshot": false,
+    "enable_profiling": true,
+    "profile_freq": 512,
+    "save_memory_snapshot_folder": "memory_snapshot",
+    "save_traces_folder": "profile_trace"
+  },
+  "training": {
+    "batch_size": 8,
+    "compile": true,
+    "context_len": 8192,
+    "data_dir": null,
+    "data_files": null,
+    "data_parallel_replicate_degree": 1,
+    "data_parallel_shard_degree": -1,
+    "data_probs": "0.55,0.3,0.15",
+    "dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro",
+    "dataset_name": "default,default,default",
+    "dataset_split": "train,train,train",
+    "deterministic": false,
+    "disable_loss_parallel": false,
+    "enable_cpu_offload": false,
+    "fsdp_reshard_after_forward": "default",
+    "gc_freq": 50,
+    "gradient_accumulation_steps": 2,
+    "max_norm": 1.0,
+    "mixed_precision_param": "bfloat16",
+    "mixed_precision_reduce": "float32",
+    "num_workers": 32,
+    "persistent_workers": false,
+    "pin_memory": false,
+    "prefetch_factor": 2,
+    "seed": 42,
+    "seq_len": 8192,
+    "skip_nan_inf": true,
+    "steps": 95366,
+    "streaming": true,
+    "tensor_parallel_degree": 1,
+    "varlen": false
+  }
+}[39m
+[titan] 2025-07-24 18:19:07,122 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
+[titan] 2025-07-24 18:19:07,697 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
+[titan] 2025-07-24 18:19:07,700 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
+[titan] 2025-07-24 18:19:07,786 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
+[titan] 2025-07-24 18:19:07,786 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
+[titan] 2025-07-24 18:19:07,786 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
+[titan] 2025-07-24 18:19:07,848 - root - INFO - Loading tokenizer...
+[titan] 2025-07-24 18:19:07,970 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
+	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+}
+)
+[titan] 2025-07-24 18:19:07,970 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default,default,default
+[titan] 2025-07-24 18:19:08,091 - root - INFO - Subset [36m/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550)[39m:
+IterableDataset({
+    features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
+    num_shards: 140
+})
+[titan] 2025-07-24 18:19:08,091 - root - INFO - Shuffling the dataset with seed 42
+[titan] 2025-07-24 18:19:08,091 - root - WARNING - [31mDataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.[39m
+[titan] 2025-07-24 18:20:06,911 - root - INFO - Subset [36m/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged:default (p = 0.300)[39m:
+IterableDataset({
+    features: ['repo', 'content'],
+    num_shards: 1
+})
+[titan] 2025-07-24 18:20:06,912 - root - INFO - Shuffling the dataset with seed 42
+[titan] 2025-07-24 18:20:06,912 - root - WARNING - [31mDataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged has insufficient shards (1). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.[39m
+[titan] 2025-07-24 18:20:07,259 - root - INFO - Subset [36m/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default (p = 0.150)[39m:
+IterableDataset({
+    features: ['text', 'cc-path', 'domain', 'lang', 'lang_score', 'timestamp', 'url', 'math_score'],
+    num_shards: 100
+})
+[titan] 2025-07-24 18:20:07,259 - root - INFO - Shuffling the dataset with seed 42
+[titan] 2025-07-24 18:20:07,259 - root - WARNING - [31mDataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro has insufficient shards (100). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.[39m
+[titan] 2025-07-24 18:20:13,871 - root - INFO - Interleaving 3 datasets with probabilities [0.55, 0.3, 0.15]
+[titan] 2025-07-24 18:20:14,713 - root - INFO - IterableDataset({
+    features: ['text', 'content'],
+    num_shards: 256
+})
+[titan] 2025-07-24 18:20:14,891 - root - INFO - Building dataloader...
+[titan] 2025-07-24 18:20:14,893 - root - INFO - Loading model config from /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_nsa_1_340M.json
+[titan] 2025-07-24 18:20:14,895 - root - INFO - Building model from the config
+[32mGatedDeltaNetConfig {
+  "allow_neg_eigval": false,
+  "architectures": [
+    "GatedDeltaNetForCausalLM"
+  ],
+  "attn": {
+    "block_counts": 16,
+    "block_size": 64,
+    "layers": [
+      5,
+      11,
+      17,
+      23
+    ],
+    "num_heads": 32,
+    "num_kv_heads": 2,
+    "qkv_bias": false,
+    "rope_theta": 160000.0,
+    "type": "nsa",
+    "window_size": null
+  },
+  "attn_mode": "chunk",
+  "bos_token_id": 1,
+  "conv_size": 4,
+  "eos_token_id": 2,
+  "expand_k": 1,
+  "expand_v": 1,
+  "fuse_cross_entropy": true,
+  "fuse_norm": true,
+  "fuse_swiglu": true,
+  "head_dim": 256,
+  "hidden_act": "swish",
+  "hidden_ratio": 4,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": null,
+  "max_position_embeddings": 8192,
+  "model_type": "gated_deltanet",
+  "norm_eps": 1e-06,
+  "norm_first": false,
+  "num_heads": 4,
+  "num_hidden_layers": 24,
+  "num_v_heads": null,
+  "qk_activation": "silu",
+  "qk_norm": "l2",
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.53.3",
+  "use_beta": true,
+  "use_cache": true,
+  "use_gate": true,
+  "use_l2warp": false,
+  "use_output_norm": true,
+  "use_short_conv": true,
+  "vocab_size": 32000
+}
+[39m
+[titan] 2025-07-24 18:20:15,243 - root - INFO - [34m
+GatedDeltaNetForCausalLM(
+  (model): GatedDeltaNetModel(
+    (embeddings): Embedding(32000, 1024)
+    (layers): ModuleList(
+      (0-4): 5 x GatedDeltaNetBlock(
+        (attn_norm): RMSNorm(1024, eps=1e-06)
+        (attn): GatedDeltaNet(
+          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
+          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
+          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
+          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
+        )
+        (mlp_norm): RMSNorm(1024, eps=1e-06)
+        (mlp): GatedMLP(
+          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
+          (swiglu_linear): SwiGLULinear()
+        )
+      )
+      (5): GatedDeltaNetBlock(
+        (attn_norm): RMSNorm(1024, eps=1e-06)
+        (attn): NativeSparseAttention(
+          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
+          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
+          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
+          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
+          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
+          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
+        )
+        (mlp_norm): RMSNorm(1024, eps=1e-06)
+        (mlp): GatedMLP(
+          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
+          (swiglu_linear): SwiGLULinear()
+        )
+      )
+      (6-10): 5 x GatedDeltaNetBlock(
+        (attn_norm): RMSNorm(1024, eps=1e-06)
+        (attn): GatedDeltaNet(
+          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
+          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
+          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
+          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
+        )
+        (mlp_norm): RMSNorm(1024, eps=1e-06)
+        (mlp): GatedMLP(
+          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
+          (swiglu_linear): SwiGLULinear()
+        )
+      )
+      (11): GatedDeltaNetBlock(
+        (attn_norm): RMSNorm(1024, eps=1e-06)
+        (attn): NativeSparseAttention(
+          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
+          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
+          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
+          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
+          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
+          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
+        )
+        (mlp_norm): RMSNorm(1024, eps=1e-06)
+        (mlp): GatedMLP(
+          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
+          (swiglu_linear): SwiGLULinear()
+        )
+      )
+      (12-16): 5 x GatedDeltaNetBlock(
+        (attn_norm): RMSNorm(1024, eps=1e-06)
+        (attn): GatedDeltaNet(
+          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
+          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
+          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
+          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
+        )
+        (mlp_norm): RMSNorm(1024, eps=1e-06)
+        (mlp): GatedMLP(
+          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
+          (swiglu_linear): SwiGLULinear()
+        )
+      )
+      (17): GatedDeltaNetBlock(
+        (attn_norm): RMSNorm(1024, eps=1e-06)
+        (attn): NativeSparseAttention(
+          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
+          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
+          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
+          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
+          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
+          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
+        )
+        (mlp_norm): RMSNorm(1024, eps=1e-06)
+        (mlp): GatedMLP(
+          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
+          (swiglu_linear): SwiGLULinear()
+        )
+      )
+      (18-22): 5 x GatedDeltaNetBlock(
+        (attn_norm): RMSNorm(1024, eps=1e-06)
+        (attn): GatedDeltaNet(
+          (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (a_proj): Linear(in_features=1024, out_features=4, bias=False)
+          (b_proj): Linear(in_features=1024, out_features=4, bias=False)
+          (q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
+          (g_proj): Linear(in_features=1024, out_features=1024, bias=False)
+          (o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
+          (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
+        )
+        (mlp_norm): RMSNorm(1024, eps=1e-06)
+        (mlp): GatedMLP(
+          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
+          (swiglu_linear): SwiGLULinear()
+        )
+      )
+      (23): GatedDeltaNetBlock(
+        (attn_norm): RMSNorm(1024, eps=1e-06)
+        (attn): NativeSparseAttention(
+          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
+          (k_proj): Linear(in_features=1024, out_features=128, bias=False)
+          (v_proj): Linear(in_features=1024, out_features=128, bias=False)
+          (g_proj): Linear(in_features=1024, out_features=96, bias=False)
+          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
+          (rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
+        )
+        (mlp_norm): RMSNorm(1024, eps=1e-06)
+        (mlp): GatedMLP(
+          (gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (up_proj): Linear(in_features=1024, out_features=2816, bias=False)
+          (down_proj): Linear(in_features=2816, out_features=1024, bias=False)
+          (swiglu_linear): SwiGLULinear()
+        )
+      )
+    )
+    (norm): RMSNorm(1024, eps=1e-06)
+  )
+  (lm_head): Linear(in_features=1024, out_features=32000, bias=False)
+  (criterion): FusedLinearCrossEntropyLoss()
+)[39m
+
+[titan] 2025-07-24 18:20:15,278 - root - INFO - Compiling each block with torch.compile
+[titan] 2025-07-24 18:20:15,278 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
+[titan] 2025-07-24 18:20:15,279 - root - INFO - Compiling the entire model with torch.compile
+[titan] 2025-07-24 18:20:15,360 - root - INFO - Applied FSDP to the model
+[titan] 2025-07-24 18:20:15,425 - fla.models.gated_deltanet.modeling_gated_deltanet - WARNING - `A_log` is a DTensor, skipping initialization
+[titan] 2025-07-24 18:20:15,426 - fla.models.gated_deltanet.modeling_gated_deltanet - WARNING - `dt_bias` is a DTensor, skipping initialization
+[titan] 2025-07-24 18:20:15,519 - root - INFO - CUDA memory usage for model: 0.10GiB(0.10%)
+[titan] 2025-07-24 18:20:15,520 - root - WARNING - Warmup (100) + decay (95366) steps exceed total training steps (95366). Adjusting decay steps to 95266.
+[titan] 2025-07-24 18:20:15,544 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/checkpoint
+[titan] 2025-07-24 18:20:15,544 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
+[titan] 2025-07-24 18:20:15,603 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
+[titan] 2025-07-24 18:20:24,033 - root - INFO - [31m***** Running training *****[39m
+[titan] 2025-07-24 18:20:24,036 - root - INFO - [32m Training starts at step 1
+[titan] 2025-07-24 18:20:24,036 - root - INFO - [32m Number of tokens per sequence = 8,192
+[titan] 2025-07-24 18:20:24,037 - root - INFO - [32m Gradient Accumulation steps = 2
+[titan] 2025-07-24 18:20:24,037 - root - INFO - [32m Instantaneous batch size (per device) = 8
+[titan] 2025-07-24 18:20:24,037 - root - INFO - [32m Global batch size (w. parallel, distributed & accumulation) = 128 (1,048,576 tokens)
+[titan] 2025-07-24 18:20:24,037 - root - INFO - [32m Total optimization steps = 95,366 (99,998,498,816 tokens)
+[titan] 2025-07-24 18:20:24,037 - root - INFO - [32m Warmup steps = 100 (104,857,600 tokens)
+[titan] 2025-07-24 18:20:24,037 - root - INFO - [32m Number of parameters = 396,695,712 [39m
+[titan] 2025-07-24 18:20:24,037 - root - INFO - Profiling active. Traces will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/profile_trace
+/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py:1263: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
+If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
+If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
+torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
+[rank6]: Traceback (most recent call last):
+[rank6]:   File "<frozen runpy>", line 198, in _run_module_as_main
+[rank6]:   File "<frozen runpy>", line 88, in _run_code
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 616, in <module>
+[rank6]:     main(config)
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
+[rank6]:     return f(*args, **kwargs)
+[rank6]:            ^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 488, in main
+[rank6]:     output = model(
+[rank6]:              ^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+[rank6]:     return self._call_impl(*args, **kwargs)
+[rank6]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
+[rank6]:     return inner()
+[rank6]:            ^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
+[rank6]:     result = forward_call(*args, **kwargs)
+[rank6]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
+[rank6]:     return func(*args, **kwargs)
+[rank6]:            ^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 424, in forward
+[rank6]:     outputs = self.model(
+[rank6]:               ^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+[rank6]:     return self._call_impl(*args, **kwargs)
+[rank6]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+[rank6]:     return forward_call(*args, **kwargs)
+[rank6]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 294, in forward
+[rank6]:     hidden_states, attentions, past_key_values = layer(
+[rank6]:                                                  ^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+[rank6]:     return self._call_impl(*args, **kwargs)
+[rank6]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
+[rank6]:     return inner()
+[rank6]:            ^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
+[rank6]:     result = forward_call(*args, **kwargs)
+[rank6]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
+[rank6]:     return fn(*args, **kwargs)
+[rank6]:            ^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+[rank6]:     return self._call_impl(*args, **kwargs)
+[rank6]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+[rank6]:     return forward_call(*args, **kwargs)
+[rank6]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 108, in forward
+[rank6]:     hidden_states = self.attn_norm(hidden_states)
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 109, in torch_dynamo_resume_in_forward_at_108
+[rank6]:     hidden_states, attentions, past_key_values = self.attn(
+[rank6]:                                                  ^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
+[rank6]:     return self._call_impl(*args, **kwargs)
+[rank6]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
+[rank6]:     return forward_call(*args, **kwargs)
+[rank6]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py", line 108, in forward
+[rank6]:     q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py", line 123, in torch_dynamo_resume_in_forward_at_108
+[rank6]:     o = parallel_nsa(
+[rank6]:         ^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 838, in parallel_nsa
+[rank6]:     o_cmp, lse_cmp = parallel_nsa_compression(
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 857, in torch_dynamo_resume_in_parallel_nsa_at_838
+[rank6]:     o = o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, scale, cu_seqlens)
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1432, in __call__
+[rank6]:     return self._torchdynamo_orig_callable(
+[rank6]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1213, in __call__
+[rank6]:     result = self._inner_convert(
+[rank6]:              ^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 598, in __call__
+[rank6]:     return _compile(
+[rank6]:            ^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1110, in _compile
+[rank6]:     raise InternalTorchDynamoError(
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1059, in _compile
+[rank6]:     guarded_code = compile_inner(code, one_graph, hooks, transform)
+[rank6]:                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_utils_internal.py", line 97, in wrapper_function
+[rank6]:     return function(*args, **kwargs)
+[rank6]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 761, in compile_inner
+[rank6]:     return _compile_inner(code, one_graph, hooks, transform)
+[rank6]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 797, in _compile_inner
+[rank6]:     out_code = transform_code_object(code, transform)
+[rank6]:                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/bytecode_transformation.py", line 1422, in transform_code_object
+[rank6]:     transformations(instructions, code_options)
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 257, in _fn
+[rank6]:     return fn(*args, **kwargs)
+[rank6]:            ^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 3498, in run
+[rank6]:     super().run()
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 1337, in run
+[rank6]:     while self.step():
+[rank6]:           ^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 1246, in step
+[rank6]:     self.dispatch_table[inst.opcode](self, inst)
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 2157, in COMPARE_OP
+[rank6]:     self.push(compare_op_handlers[inst.argval](self, self.popn(2), {}))
+[rank6]:               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 1111, in call_function
+[rank6]:     return handler(tx, args, kwargs)
+[rank6]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 789, in <lambda>
+[rank6]:     return lambda tx, args, kwargs: obj.call_function(
+[rank6]:                                     ^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 1111, in call_function
+[rank6]:     return handler(tx, args, kwargs)
+[rank6]:            ^^^^^^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 945, in builtin_dispatch
+[rank6]:     rv = fn(tx, args, kwargs)
+[rank6]:          ^^^^^^^^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 839, in call_binop_handlers
+[rank6]:     rv = fn(tx, *args)
+[rank6]:          ^^^^^^^^^^^^^
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 533, in compare_by_value
+[rank6]:     return ConstantVariable(op(a.value, b.value))
+[rank6]:                             ^^^^^^^^^^^^^^^^^^^^
+[rank6]: torch._dynamo.exc.InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'
+
+[rank6]: from user code:
+[rank6]:   File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857
+[rank6]:     if window_size > 0:
+
+[rank6]: Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
+
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/6/stdout.log
ADDED
File without changes
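The stdout.log is empty, and the attempt_0/7 error.json below carries the identical trace: the comparison runs on a config-derived Python constant while Dynamo traces the frame, before any data-dependent work, so every rank fails the same way at the first NativeSparseAttention layer. A hedged, self-contained repro of that trace-time behavior (illustrative only; the exact exception class can vary across PyTorch versions):

import torch

@torch.compile
def branch(x: torch.Tensor, window_size=None):
    # window_size is a Python constant to Dynamo; comparing None > 0 fails
    # while the frame is being traced, not while the compiled kernel runs.
    if window_size > 0:
        return x + 1
    return x

# branch(torch.ones(4))  # raises torch._dynamo.exc.InternalTorchDynamoError(TypeError)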
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/7/error.json
ADDED
@@ -0,0 +1 @@
{"message": {"message": "InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'\n\nfrom user code:\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857\n if window_size > 0:\n\nSet TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS=\"+dynamo\"\n", "extraInfo": {"py_callstack": "Traceback (most recent call last):\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py\", line 355, in wrapper\n return f(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py\", line 488, in main\n output = model(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py\", line 172, in wrapped_func\n return func(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 424, in forward\n outputs = self.model(\n ^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 294, in forward\n hidden_states, attentions, past_key_values = layer(\n ^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1857, in _call_impl\n return inner()\n ^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1805, in inner\n result = forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py\", line 655, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 108, in forward\n hidden_states = self.attn_norm(hidden_states)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py\", line 109, in torch_dynamo_resume_in_forward_at_108\n hidden_states, attentions, past_key_values = self.attn(\n ^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1751, in _wrapped_call_impl\n return self._call_impl(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py\", line 1762, in _call_impl\n return forward_call(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py\", line 108, in forward\n q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py\", line 123, in torch_dynamo_resume_in_forward_at_108\n o = parallel_nsa(\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 838, in parallel_nsa\n o_cmp, lse_cmp = parallel_nsa_compression(\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 857, in torch_dynamo_resume_in_parallel_nsa_at_838\n o = o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, 
block_size, scale, cu_seqlens)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1432, in __call__\n return self._torchdynamo_orig_callable(\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1213, in __call__\n result = self._inner_convert(\n ^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 598, in __call__\n return _compile(\n ^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1110, in _compile\n raise InternalTorchDynamoError(\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 1059, in _compile\n guarded_code = compile_inner(code, one_graph, hooks, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_utils_internal.py\", line 97, in wrapper_function\n return function(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 761, in compile_inner\n return _compile_inner(code, one_graph, hooks, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 797, in _compile_inner\n out_code = transform_code_object(code, transform)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/bytecode_transformation.py\", line 1422, in transform_code_object\n transformations(instructions, code_options)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 257, in _fn\n return fn(*args, **kwargs)\n ^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py\", line 715, in transform\n tracer.run()\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 3498, in run\n super().run()\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 1337, in run\n while self.step():\n ^^^^^^^^^^^\n File 
\"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 1246, in step\n self.dispatch_table[inst.opcode](self, inst)\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py\", line 2157, in COMPARE_OP\n self.push(compare_op_handlers[inst.argval](self, self.popn(2), {}))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 1111, in call_function\n return handler(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 789, in <lambda>\n return lambda tx, args, kwargs: obj.call_function(\n ^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 1111, in call_function\n return handler(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 945, in builtin_dispatch\n rv = fn(tx, args, kwargs)\n ^^^^^^^^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 839, in call_binop_handlers\n rv = fn(tx, *args)\n ^^^^^^^^^^^^^\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py\", line 533, in compare_by_value\n return ConstantVariable(op(a.value, b.value))\n ^^^^^^^^^^^^^^^^^^^^\ntorch._dynamo.exc.InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'\n\nfrom user code:\n File \"/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py\", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857\n if window_size > 0:\n\nSet TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS=\"+dynamo\"\n\n", "timestamp": "1753352475"}}}
|
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/7/stderr.log
ADDED
|
@@ -0,0 +1,571 @@
|
| 1 |
+
[titan] 2025-07-24 18:19:06,088 - root - INFO - Starting job: default job
|
| 2 |
+
[titan] 2025-07-24 18:19:06,088 - root - INFO - {
|
| 3 |
+
"activation_checkpoint": {
|
| 4 |
+
"mode": "none",
|
| 5 |
+
"selective_ac_option": "2"
|
| 6 |
+
},
|
| 7 |
+
"activation_offload": {
|
| 8 |
+
"mode": "none"
|
| 9 |
+
},
|
| 10 |
+
"checkpoint": {
|
| 11 |
+
"async_mode": "disabled",
|
| 12 |
+
"create_seed_checkpoint": false,
|
| 13 |
+
"enable_checkpoint": true,
|
| 14 |
+
"exclude_from_loading": [],
|
| 15 |
+
"export_dtype": "bfloat16",
|
| 16 |
+
"folder": "checkpoint",
|
| 17 |
+
"interval": 8192,
|
| 18 |
+
"interval_type": "steps",
|
| 19 |
+
"keep_latest_k": 100,
|
| 20 |
+
"load_step": -1,
|
| 21 |
+
"model_weights_only": false
|
| 22 |
+
},
|
| 23 |
+
"comm": {
|
| 24 |
+
"init_timeout_seconds": 300,
|
| 25 |
+
"trace_buf_size": 20000,
|
| 26 |
+
"train_timeout_seconds": 100
|
| 27 |
+
},
|
| 28 |
+
"experimental": {
|
| 29 |
+
"context_parallel_degree": 1,
|
| 30 |
+
"context_parallel_rotate_method": "allgather",
|
| 31 |
+
"custom_model_path": "",
|
| 32 |
+
"enable_async_tensor_parallel": false,
|
| 33 |
+
"enable_compiled_autograd": false,
|
| 34 |
+
"pipeline_parallel_degree": 1,
|
| 35 |
+
"pipeline_parallel_microbatches": null,
|
| 36 |
+
"pipeline_parallel_schedule": "1F1B",
|
| 37 |
+
"pipeline_parallel_schedule_csv": "",
|
| 38 |
+
"pipeline_parallel_split_points": []
|
| 39 |
+
},
|
| 40 |
+
"fault_tolerance": {
|
| 41 |
+
"enable": false,
|
| 42 |
+
"group_size": 0,
|
| 43 |
+
"min_replica_size": 1,
|
| 44 |
+
"replica_id": 0
|
| 45 |
+
},
|
| 46 |
+
"float8": {
|
| 47 |
+
"enable_fsdp_float8_all_gather": false,
|
| 48 |
+
"force_recompute_fp8_weight_in_bwd": false,
|
| 49 |
+
"precompute_float8_dynamic_scale_for_fsdp": false,
|
| 50 |
+
"recipe_name": null
|
| 51 |
+
},
|
| 52 |
+
"job": {
|
| 53 |
+
"config_file": "flame/models/fla.toml",
|
| 54 |
+
"description": "default job",
|
| 55 |
+
"dump_folder": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2",
|
| 56 |
+
"print_args": true,
|
| 57 |
+
"use_for_integration_test": false
|
| 58 |
+
},
|
| 59 |
+
"lr_scheduler": {
|
| 60 |
+
"decay_ratio": 1.0,
|
| 61 |
+
"decay_type": "linear",
|
| 62 |
+
"lr_min": 0.01,
|
| 63 |
+
"warmup_steps": 100
|
| 64 |
+
},
|
| 65 |
+
"memory_estimation": {
|
| 66 |
+
"disable_fake_mode": false,
|
| 67 |
+
"enabled": false
|
| 68 |
+
},
|
| 69 |
+
"metrics": {
|
| 70 |
+
"disable_color_printing": false,
|
| 71 |
+
"enable_tensorboard": true,
|
| 72 |
+
"enable_wandb": true,
|
| 73 |
+
"log_freq": 1,
|
| 74 |
+
"save_for_all_ranks": false,
|
| 75 |
+
"save_tb_folder": "tb"
|
| 76 |
+
},
|
| 77 |
+
"model": {
|
| 78 |
+
"config": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_nsa_1_340M.json",
|
| 79 |
+
"converters": [],
|
| 80 |
+
"name": "fla",
|
| 81 |
+
"print_after_conversion": false,
|
| 82 |
+
"tokenizer_path": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer"
|
| 83 |
+
},
|
| 84 |
+
"optimizer": {
|
| 85 |
+
"early_step_in_backward": false,
|
| 86 |
+
"eps": 1e-08,
|
| 87 |
+
"implementation": "fused",
|
| 88 |
+
"lr": 0.0003,
|
| 89 |
+
"name": "AdamW"
|
| 90 |
+
},
|
| 91 |
+
"profiling": {
|
| 92 |
+
"enable_memory_snapshot": false,
|
| 93 |
+
"enable_profiling": true,
|
| 94 |
+
"profile_freq": 512,
|
| 95 |
+
"save_memory_snapshot_folder": "memory_snapshot",
|
| 96 |
+
"save_traces_folder": "profile_trace"
|
| 97 |
+
},
|
| 98 |
+
"training": {
|
| 99 |
+
"batch_size": 8,
|
| 100 |
+
"compile": true,
|
| 101 |
+
"context_len": 8192,
|
| 102 |
+
"data_dir": null,
|
| 103 |
+
"data_files": null,
|
| 104 |
+
"data_parallel_replicate_degree": 1,
|
| 105 |
+
"data_parallel_shard_degree": -1,
|
| 106 |
+
"data_probs": "0.55,0.3,0.15",
|
| 107 |
+
"dataset": "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro",
|
| 108 |
+
"dataset_name": "default,default,default",
|
| 109 |
+
"dataset_split": "train,train,train",
|
| 110 |
+
"deterministic": false,
|
| 111 |
+
"disable_loss_parallel": false,
|
| 112 |
+
"enable_cpu_offload": false,
|
| 113 |
+
"fsdp_reshard_after_forward": "default",
|
| 114 |
+
"gc_freq": 50,
|
| 115 |
+
"gradient_accumulation_steps": 2,
|
| 116 |
+
"max_norm": 1.0,
|
| 117 |
+
"mixed_precision_param": "bfloat16",
|
| 118 |
+
"mixed_precision_reduce": "float32",
|
| 119 |
+
"num_workers": 32,
|
| 120 |
+
"persistent_workers": false,
|
| 121 |
+
"pin_memory": false,
|
| 122 |
+
"prefetch_factor": 2,
|
| 123 |
+
"seed": 42,
|
| 124 |
+
"seq_len": 8192,
|
| 125 |
+
"skip_nan_inf": true,
|
| 126 |
+
"steps": 95366,
|
| 127 |
+
"streaming": true,
|
| 128 |
+
"tensor_parallel_degree": 1,
|
| 129 |
+
"varlen": false
|
| 130 |
+
}
|
| 131 |
+
}
|
| 132 |
+
[titan] 2025-07-24 18:19:06,089 - root - INFO - [GC] Initial GC collection. 0.00 seconds.
|
| 133 |
+
[titan] 2025-07-24 18:19:06,815 - root - WARNING - ENV[TORCH_NCCL_ASYNC_ERROR_HANDLING] = 1 will be overridden to 3 based on job config
|
| 134 |
+
[titan] 2025-07-24 18:19:06,817 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 135 |
+
[titan] 2025-07-24 18:19:06,874 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 136 |
+
[titan] 2025-07-24 18:19:06,874 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
|
| 137 |
+
[titan] 2025-07-24 18:19:06,875 - root - INFO - Building 1-D device mesh with ['dp_shard'], [8]
|
| 138 |
+
[titan] 2025-07-24 18:19:06,881 - root - INFO - Loading tokenizer...
|
| 139 |
+
[titan] 2025-07-24 18:19:06,985 - root - INFO - LlamaTokenizerFast(name_or_path='/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/tokenizer', vocab_size=32000, model_max_length=10000000000, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
|
| 140 |
+
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 141 |
+
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 142 |
+
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
[titan] 2025-07-24 18:19:06,986 - root - INFO - Loading dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged,/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default,default,default
|
| 146 |
+
[titan] 2025-07-24 18:19:07,101 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample:default (p = 0.550):
|
| 147 |
+
IterableDataset({
|
| 148 |
+
features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
|
| 149 |
+
num_shards: 140
|
| 150 |
+
})
|
| 151 |
+
[titan] 2025-07-24 18:19:07,101 - root - INFO - Shuffling the dataset with seed 42
|
| 152 |
+
[titan] 2025-07-24 18:19:07,101 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/fineweb-edu-sample has insufficient shards (140). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
|
| 153 |
+
[titan] 2025-07-24 18:20:11,991 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged:default (p = 0.300):
|
| 154 |
+
IterableDataset({
|
| 155 |
+
features: ['repo', 'content'],
|
| 156 |
+
num_shards: 1
|
| 157 |
+
})
|
| 158 |
+
[titan] 2025-07-24 18:20:11,991 - root - INFO - Shuffling the dataset with seed 42
|
| 159 |
+
[titan] 2025-07-24 18:20:11,992 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/small_repos_20B_sample_merged has insufficient shards (1). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
|
| 160 |
+
[titan] 2025-07-24 18:20:12,410 - root - INFO - Subset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro:default (p = 0.150):
|
| 161 |
+
IterableDataset({
|
| 162 |
+
features: ['text', 'cc-path', 'domain', 'lang', 'lang_score', 'timestamp', 'url', 'math_score'],
|
| 163 |
+
num_shards: 100
|
| 164 |
+
})
|
| 165 |
+
[titan] 2025-07-24 18:20:12,411 - root - INFO - Shuffling the dataset with seed 42
|
| 166 |
+
[titan] 2025-07-24 18:20:12,411 - root - WARNING - Dataset /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/dataset/megamath-web-pro has insufficient shards (100). Need 256 shards minimum for desired data parallel workers × 32 dataloader workers. Resharding dataset to 256 shards and disabling streaming mode.
|
| 167 |
+
[titan] 2025-07-24 18:20:19,060 - root - INFO - Interleaving 3 datasets with probabilities [0.55, 0.3, 0.15]
|
| 168 |
+
[titan] 2025-07-24 18:20:19,771 - root - INFO - IterableDataset({
|
| 169 |
+
features: ['text', 'content'],
|
| 170 |
+
num_shards: 256
|
| 171 |
+
})
|
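Note: the interleaving logged above is consistent with Hugging Face `datasets.interleave_datasets`. A hedged sketch follows (hypothetical short paths stand in for the three local subsets; this is not flame's actual dataloader code):

```python
# Sketch of the probabilistic mixing logged above; not flame's actual code.
# Three streaming subsets are sampled with fixed probabilities and seed 42.
from datasets import interleave_datasets, load_dataset

paths = ["fineweb-edu-sample", "small_repos_20B_sample_merged", "megamath-web-pro"]
subsets = [load_dataset(p, split="train", streaming=True) for p in paths]
mixed = interleave_datasets(subsets, probabilities=[0.55, 0.30, 0.15], seed=42)
```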
| 172 |
+
[titan] 2025-07-24 18:20:19,895 - root - INFO - Building dataloader...
|
| 173 |
+
[titan] 2025-07-24 18:20:19,898 - root - INFO - Loading model config from /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/configs/gdn_6_nsa_1_340M.json
|
| 174 |
+
[titan] 2025-07-24 18:20:19,900 - root - INFO - Building model from the config
|
| 175 |
+
GatedDeltaNetConfig {
|
| 176 |
+
"allow_neg_eigval": false,
|
| 177 |
+
"architectures": [
|
| 178 |
+
"GatedDeltaNetForCausalLM"
|
| 179 |
+
],
|
| 180 |
+
"attn": {
|
| 181 |
+
"block_counts": 16,
|
| 182 |
+
"block_size": 64,
|
| 183 |
+
"layers": [
|
| 184 |
+
5,
|
| 185 |
+
11,
|
| 186 |
+
17,
|
| 187 |
+
23
|
| 188 |
+
],
|
| 189 |
+
"num_heads": 32,
|
| 190 |
+
"num_kv_heads": 2,
|
| 191 |
+
"qkv_bias": false,
|
| 192 |
+
"rope_theta": 160000.0,
|
| 193 |
+
"type": "nsa",
|
| 194 |
+
"window_size": null
|
| 195 |
+
},
|
| 196 |
+
"attn_mode": "chunk",
|
| 197 |
+
"bos_token_id": 1,
|
| 198 |
+
"conv_size": 4,
|
| 199 |
+
"eos_token_id": 2,
|
| 200 |
+
"expand_k": 1,
|
| 201 |
+
"expand_v": 1,
|
| 202 |
+
"fuse_cross_entropy": true,
|
| 203 |
+
"fuse_norm": true,
|
| 204 |
+
"fuse_swiglu": true,
|
| 205 |
+
"head_dim": 256,
|
| 206 |
+
"hidden_act": "swish",
|
| 207 |
+
"hidden_ratio": 4,
|
| 208 |
+
"hidden_size": 1024,
|
| 209 |
+
"initializer_range": 0.02,
|
| 210 |
+
"intermediate_size": null,
|
| 211 |
+
"max_position_embeddings": 8192,
|
| 212 |
+
"model_type": "gated_deltanet",
|
| 213 |
+
"norm_eps": 1e-06,
|
| 214 |
+
"norm_first": false,
|
| 215 |
+
"num_heads": 4,
|
| 216 |
+
"num_hidden_layers": 24,
|
| 217 |
+
"num_v_heads": null,
|
| 218 |
+
"qk_activation": "silu",
|
| 219 |
+
"qk_norm": "l2",
|
| 220 |
+
"tie_word_embeddings": false,
|
| 221 |
+
"torch_dtype": "bfloat16",
|
| 222 |
+
"transformers_version": "4.53.3",
|
| 223 |
+
"use_beta": true,
|
| 224 |
+
"use_cache": true,
|
| 225 |
+
"use_gate": true,
|
| 226 |
+
"use_l2warp": false,
|
| 227 |
+
"use_output_norm": true,
|
| 228 |
+
"use_short_conv": true,
|
| 229 |
+
"vocab_size": 32000
|
| 230 |
+
}
|
| 231 |
+
|
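A quick consistency check of the NSA projection shapes that appear in the module dump below, derived from the `attn` block in the config above (head_dim 64 is inferred from the rotary dim; three gates per head is an assumption about NSA's compression/selection/sliding branches):

```python
# NSA shape check from the config above: 32 query heads, 2 KV heads,
# per-head dim 64 (matches RotaryEmbedding(dim=64) in the dump below).
hidden, num_heads, num_kv_heads, head_dim = 1024, 32, 2, 64
assert num_heads * head_dim == 2048      # q_proj: 1024 -> 2048
assert num_kv_heads * head_dim == 128    # k_proj / v_proj: 1024 -> 128
assert num_heads * 3 == 96               # g_proj: one gate per head per branch (assumed)
```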
| 232 |
+
[titan] 2025-07-24 18:20:20,253 - root - INFO -
|
| 233 |
+
GatedDeltaNetForCausalLM(
|
| 234 |
+
(model): GatedDeltaNetModel(
|
| 235 |
+
(embeddings): Embedding(32000, 1024)
|
| 236 |
+
(layers): ModuleList(
|
| 237 |
+
(0-4): 5 x GatedDeltaNetBlock(
|
| 238 |
+
(attn_norm): RMSNorm(1024, eps=1e-06)
|
| 239 |
+
(attn): GatedDeltaNet(
|
| 240 |
+
(q_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 241 |
+
(k_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 242 |
+
(v_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 243 |
+
(a_proj): Linear(in_features=1024, out_features=4, bias=False)
|
| 244 |
+
(b_proj): Linear(in_features=1024, out_features=4, bias=False)
|
| 245 |
+
(q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 246 |
+
(k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 247 |
+
(v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 248 |
+
(g_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 249 |
+
(o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
|
| 250 |
+
(o_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 251 |
+
)
|
| 252 |
+
(mlp_norm): RMSNorm(1024, eps=1e-06)
|
| 253 |
+
(mlp): GatedMLP(
|
| 254 |
+
(gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 255 |
+
(up_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 256 |
+
(down_proj): Linear(in_features=2816, out_features=1024, bias=False)
|
| 257 |
+
(swiglu_linear): SwiGLULinear()
|
| 258 |
+
)
|
| 259 |
+
)
|
| 260 |
+
(5): GatedDeltaNetBlock(
|
| 261 |
+
(attn_norm): RMSNorm(1024, eps=1e-06)
|
| 262 |
+
(attn): NativeSparseAttention(
|
| 263 |
+
(q_proj): Linear(in_features=1024, out_features=2048, bias=False)
|
| 264 |
+
(k_proj): Linear(in_features=1024, out_features=128, bias=False)
|
| 265 |
+
(v_proj): Linear(in_features=1024, out_features=128, bias=False)
|
| 266 |
+
(g_proj): Linear(in_features=1024, out_features=96, bias=False)
|
| 267 |
+
(o_proj): Linear(in_features=2048, out_features=1024, bias=False)
|
| 268 |
+
(rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
|
| 269 |
+
)
|
| 270 |
+
(mlp_norm): RMSNorm(1024, eps=1e-06)
|
| 271 |
+
(mlp): GatedMLP(
|
| 272 |
+
(gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 273 |
+
(up_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 274 |
+
(down_proj): Linear(in_features=2816, out_features=1024, bias=False)
|
| 275 |
+
(swiglu_linear): SwiGLULinear()
|
| 276 |
+
)
|
| 277 |
+
)
|
| 278 |
+
(6-10): 5 x GatedDeltaNetBlock(
|
| 279 |
+
(attn_norm): RMSNorm(1024, eps=1e-06)
|
| 280 |
+
(attn): GatedDeltaNet(
|
| 281 |
+
(q_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 282 |
+
(k_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 283 |
+
(v_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 284 |
+
(a_proj): Linear(in_features=1024, out_features=4, bias=False)
|
| 285 |
+
(b_proj): Linear(in_features=1024, out_features=4, bias=False)
|
| 286 |
+
(q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 287 |
+
(k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 288 |
+
(v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 289 |
+
(g_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 290 |
+
(o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
|
| 291 |
+
(o_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 292 |
+
)
|
| 293 |
+
(mlp_norm): RMSNorm(1024, eps=1e-06)
|
| 294 |
+
(mlp): GatedMLP(
|
| 295 |
+
(gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 296 |
+
(up_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 297 |
+
(down_proj): Linear(in_features=2816, out_features=1024, bias=False)
|
| 298 |
+
(swiglu_linear): SwiGLULinear()
|
| 299 |
+
)
|
| 300 |
+
)
|
| 301 |
+
(11): GatedDeltaNetBlock(
|
| 302 |
+
(attn_norm): RMSNorm(1024, eps=1e-06)
|
| 303 |
+
(attn): NativeSparseAttention(
|
| 304 |
+
(q_proj): Linear(in_features=1024, out_features=2048, bias=False)
|
| 305 |
+
(k_proj): Linear(in_features=1024, out_features=128, bias=False)
|
| 306 |
+
(v_proj): Linear(in_features=1024, out_features=128, bias=False)
|
| 307 |
+
(g_proj): Linear(in_features=1024, out_features=96, bias=False)
|
| 308 |
+
(o_proj): Linear(in_features=2048, out_features=1024, bias=False)
|
| 309 |
+
(rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
|
| 310 |
+
)
|
| 311 |
+
(mlp_norm): RMSNorm(1024, eps=1e-06)
|
| 312 |
+
(mlp): GatedMLP(
|
| 313 |
+
(gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 314 |
+
(up_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 315 |
+
(down_proj): Linear(in_features=2816, out_features=1024, bias=False)
|
| 316 |
+
(swiglu_linear): SwiGLULinear()
|
| 317 |
+
)
|
| 318 |
+
)
|
| 319 |
+
(12-16): 5 x GatedDeltaNetBlock(
|
| 320 |
+
(attn_norm): RMSNorm(1024, eps=1e-06)
|
| 321 |
+
(attn): GatedDeltaNet(
|
| 322 |
+
(q_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 323 |
+
(k_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 324 |
+
(v_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 325 |
+
(a_proj): Linear(in_features=1024, out_features=4, bias=False)
|
| 326 |
+
(b_proj): Linear(in_features=1024, out_features=4, bias=False)
|
| 327 |
+
(q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 328 |
+
(k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 329 |
+
(v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 330 |
+
(g_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 331 |
+
(o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
|
| 332 |
+
(o_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 333 |
+
)
|
| 334 |
+
(mlp_norm): RMSNorm(1024, eps=1e-06)
|
| 335 |
+
(mlp): GatedMLP(
|
| 336 |
+
(gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 337 |
+
(up_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 338 |
+
(down_proj): Linear(in_features=2816, out_features=1024, bias=False)
|
| 339 |
+
(swiglu_linear): SwiGLULinear()
|
| 340 |
+
)
|
| 341 |
+
)
|
| 342 |
+
(17): GatedDeltaNetBlock(
|
| 343 |
+
(attn_norm): RMSNorm(1024, eps=1e-06)
|
| 344 |
+
(attn): NativeSparseAttention(
|
| 345 |
+
(q_proj): Linear(in_features=1024, out_features=2048, bias=False)
|
| 346 |
+
(k_proj): Linear(in_features=1024, out_features=128, bias=False)
|
| 347 |
+
(v_proj): Linear(in_features=1024, out_features=128, bias=False)
|
| 348 |
+
(g_proj): Linear(in_features=1024, out_features=96, bias=False)
|
| 349 |
+
(o_proj): Linear(in_features=2048, out_features=1024, bias=False)
|
| 350 |
+
(rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
|
| 351 |
+
)
|
| 352 |
+
(mlp_norm): RMSNorm(1024, eps=1e-06)
|
| 353 |
+
(mlp): GatedMLP(
|
| 354 |
+
(gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 355 |
+
(up_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 356 |
+
(down_proj): Linear(in_features=2816, out_features=1024, bias=False)
|
| 357 |
+
(swiglu_linear): SwiGLULinear()
|
| 358 |
+
)
|
| 359 |
+
)
|
| 360 |
+
(18-22): 5 x GatedDeltaNetBlock(
|
| 361 |
+
(attn_norm): RMSNorm(1024, eps=1e-06)
|
| 362 |
+
(attn): GatedDeltaNet(
|
| 363 |
+
(q_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 364 |
+
(k_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 365 |
+
(v_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 366 |
+
(a_proj): Linear(in_features=1024, out_features=4, bias=False)
|
| 367 |
+
(b_proj): Linear(in_features=1024, out_features=4, bias=False)
|
| 368 |
+
(q_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 369 |
+
(k_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 370 |
+
(v_conv1d): ShortConvolution(1024, 1024, kernel_size=(4,), stride=(1,), padding=(3,), groups=1024, bias=False, activation=silu, backend=cuda)
|
| 371 |
+
(g_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 372 |
+
(o_norm): FusedRMSNormGated(256, eps=1e-06, activation=swish)
|
| 373 |
+
(o_proj): Linear(in_features=1024, out_features=1024, bias=False)
|
| 374 |
+
)
|
| 375 |
+
(mlp_norm): RMSNorm(1024, eps=1e-06)
|
| 376 |
+
(mlp): GatedMLP(
|
| 377 |
+
(gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 378 |
+
(up_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 379 |
+
(down_proj): Linear(in_features=2816, out_features=1024, bias=False)
|
| 380 |
+
(swiglu_linear): SwiGLULinear()
|
| 381 |
+
)
|
| 382 |
+
)
|
| 383 |
+
(23): GatedDeltaNetBlock(
|
| 384 |
+
(attn_norm): RMSNorm(1024, eps=1e-06)
|
| 385 |
+
(attn): NativeSparseAttention(
|
| 386 |
+
(q_proj): Linear(in_features=1024, out_features=2048, bias=False)
|
| 387 |
+
(k_proj): Linear(in_features=1024, out_features=128, bias=False)
|
| 388 |
+
(v_proj): Linear(in_features=1024, out_features=128, bias=False)
|
| 389 |
+
(g_proj): Linear(in_features=1024, out_features=96, bias=False)
|
| 390 |
+
(o_proj): Linear(in_features=2048, out_features=1024, bias=False)
|
| 391 |
+
(rotary): RotaryEmbedding(dim=64, base=160000.0, interleaved=False, pos_idx_in_fp32=True)
|
| 392 |
+
)
|
| 393 |
+
(mlp_norm): RMSNorm(1024, eps=1e-06)
|
| 394 |
+
(mlp): GatedMLP(
|
| 395 |
+
(gate_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 396 |
+
(up_proj): Linear(in_features=1024, out_features=2816, bias=False)
|
| 397 |
+
(down_proj): Linear(in_features=2816, out_features=1024, bias=False)
|
| 398 |
+
(swiglu_linear): SwiGLULinear()
|
| 399 |
+
)
|
| 400 |
+
)
|
| 401 |
+
)
|
| 402 |
+
(norm): RMSNorm(1024, eps=1e-06)
|
| 403 |
+
)
|
| 404 |
+
(lm_head): Linear(in_features=1024, out_features=32000, bias=False)
|
| 405 |
+
(criterion): FusedLinearCrossEntropyLoss()
|
| 406 |
+
)
|
| 407 |
+
|
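The dump above shows the hybrid layout: `attn.layers = [5, 11, 17, 23]` places NativeSparseAttention at every sixth block and GatedDeltaNet everywhere else. A minimal sketch of that layout:

```python
# Layer layout implied by the config: one NSA block per group of six,
# GatedDeltaNet in the remaining five (presumably the "gdn_6_nsa_1" in the run name).
nsa_layers = {5, 11, 17, 23}
layout = ["nsa" if i in nsa_layers else "gdn" for i in range(24)]
assert layout.count("nsa") == 4 and layout.count("gdn") == 20
```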
| 408 |
+
[titan] 2025-07-24 18:20:20,288 - root - INFO - Compiling each block with torch.compile
|
| 409 |
+
[titan] 2025-07-24 18:20:20,288 - root - INFO - Compiling the embedding, norm, and lm_head layers with torch.compile
|
| 410 |
+
[titan] 2025-07-24 18:20:20,289 - root - INFO - Compiling the entire model with torch.compile
|
| 411 |
+
[titan] 2025-07-24 18:20:20,376 - root - INFO - Applied FSDP to the model
|
| 412 |
+
[titan] 2025-07-24 18:20:20,441 - fla.models.gated_deltanet.modeling_gated_deltanet - WARNING - `A_log` is a DTensor, skipping initialization
|
| 413 |
+
[titan] 2025-07-24 18:20:20,442 - fla.models.gated_deltanet.modeling_gated_deltanet - WARNING - `dt_bias` is a DTensor, skipping initialization
|
| 414 |
+
[titan] 2025-07-24 18:20:20,539 - root - INFO - CUDA memory usage for model: 0.10GiB(0.10%)
|
| 415 |
+
[titan] 2025-07-24 18:20:20,541 - root - WARNING - Warmup (100) + decay (95366) steps exceed total training steps (95366). Adjusting decay steps to 95266.
|
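The warning above is just the scheduler clipping its budget: `decay_ratio = 1.0` asks for decay over all 95,366 steps, but 100 warmup steps come first. A sketch of the inferred rule (not necessarily flame's exact code):

```python
# Inferred from the warning above: decay is clipped so warmup + decay <= steps.
steps, warmup, decay_ratio = 95_366, 100, 1.0
decay = min(int(steps * decay_ratio), steps - warmup)
assert decay == 95_266
```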
| 416 |
+
[titan] 2025-07-24 18:20:20,564 - root - INFO - Checkpointing active. Checkpoints will be loaded from and saved to /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/checkpoint
|
| 417 |
+
[titan] 2025-07-24 18:20:20,565 - root - INFO - CUDA capacity: NVIDIA H20 with 95.00GiB memory
|
| 418 |
+
[titan] 2025-07-24 18:20:20,622 - root - WARNING - Peak flops undefined for: NVIDIA H20, fallback to A100
|
| 419 |
+
[titan] 2025-07-24 18:20:29,001 - root - INFO - ***** Running training *****
|
| 420 |
+
[titan] 2025-07-24 18:20:29,002 - root - INFO -  Training starts at step 1
|
| 421 |
+
[titan] 2025-07-24 18:20:29,002 - root - INFO -  Number of tokens per sequence = 8,192
|
| 422 |
+
[titan] 2025-07-24 18:20:29,002 - root - INFO -  Gradient Accumulation steps = 2
|
| 423 |
+
[titan] 2025-07-24 18:20:29,004 - root - INFO -  Instantaneous batch size (per device) = 8
|
| 424 |
+
[titan] 2025-07-24 18:20:29,005 - root - INFO -  Global batch size (w. parallel, distributed & accumulation) = 128 (1,048,576 tokens)
|
| 425 |
+
[titan] 2025-07-24 18:20:29,005 - root - INFO -  Total optimization steps = 95,366 (99,998,498,816 tokens)
|
| 426 |
+
[titan] 2025-07-24 18:20:29,005 - root - INFO -  Warmup steps = 100 (104,857,600 tokens)
|
| 427 |
+
[titan] 2025-07-24 18:20:29,005 - root - INFO -  Number of parameters = 396,695,712
|
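The batch and token accounting above is internally consistent; all values below are taken from this log (8 data-parallel ranks from the 1-D device mesh):

```python
# Token accounting for the run stats above.
per_device, grad_accum, dp_ranks, seq_len = 8, 2, 8, 8192
global_batch = per_device * grad_accum * dp_ranks      # 128 sequences per step
tokens_per_step = global_batch * seq_len               # 1,048,576 tokens per step
assert tokens_per_step == 1_048_576
assert 95_366 * tokens_per_step == 99_998_498_816      # total optimization tokens
assert 100 * tokens_per_step == 104_857_600            # warmup tokens
```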
| 428 |
+
[titan] 2025-07-24 18:20:29,006 - root - INFO - Profiling active. Traces will be saved at /mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/pretrain-linear-moe/flame/exp/bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/profile_trace
|
| 429 |
+
/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/functions.py:1263: UserWarning: Dynamo does not know how to trace the builtin `cuda_utils.get_device_properties.` This function is either a Python builtin (e.g. _warnings.warn) or a third-party C/C++ Python extension (perhaps created with pybind).
|
| 430 |
+
If it is a Python builtin, please file an issue on GitHub so the PyTorch team can add support for it and see the next case for a workaround.
|
| 431 |
+
If it is a third-party C/C++ Python extension, please either wrap it into a PyTorch-understood custom operator (see https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html for more details) or, if it is traceable, use `torch.compiler.allow_in_graph`.
|
| 432 |
+
torch._dynamo.utils.warn_once(explanation + "\n" + "\n".join(hints))
|
| 433 |
+
[rank7]: Traceback (most recent call last):
|
| 434 |
+
[rank7]: File "<frozen runpy>", line 198, in _run_module_as_main
|
| 435 |
+
[rank7]: File "<frozen runpy>", line 88, in _run_code
|
| 436 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 616, in <module>
|
| 437 |
+
[rank7]: main(config)
|
| 438 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 355, in wrapper
|
| 439 |
+
[rank7]: return f(*args, **kwargs)
|
| 440 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^
|
| 441 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/flame/train.py", line 488, in main
|
| 442 |
+
[rank7]: output = model(
|
| 443 |
+
[rank7]: ^^^^^^
|
| 444 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
|
| 445 |
+
[rank7]: return self._call_impl(*args, **kwargs)
|
| 446 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 447 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
|
| 448 |
+
[rank7]: return inner()
|
| 449 |
+
[rank7]: ^^^^^^^
|
| 450 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
|
| 451 |
+
[rank7]: result = forward_call(*args, **kwargs)
|
| 452 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 453 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
|
| 454 |
+
[rank7]: return func(*args, **kwargs)
|
| 455 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^
|
| 456 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 424, in forward
|
| 457 |
+
[rank7]: outputs = self.model(
|
| 458 |
+
[rank7]: ^^^^^^^^^^^
|
| 459 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
|
| 460 |
+
[rank7]: return self._call_impl(*args, **kwargs)
|
| 461 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 462 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
|
| 463 |
+
[rank7]: return forward_call(*args, **kwargs)
|
| 464 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 465 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 294, in forward
|
| 466 |
+
[rank7]: hidden_states, attentions, past_key_values = layer(
|
| 467 |
+
[rank7]: ^^^^^^
|
| 468 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
|
| 469 |
+
[rank7]: return self._call_impl(*args, **kwargs)
|
| 470 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 471 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1857, in _call_impl
|
| 472 |
+
[rank7]: return inner()
|
| 473 |
+
[rank7]: ^^^^^^^
|
| 474 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1805, in inner
|
| 475 |
+
[rank7]: result = forward_call(*args, **kwargs)
|
| 476 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 477 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 655, in _fn
|
| 478 |
+
[rank7]: return fn(*args, **kwargs)
|
| 479 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^
|
| 480 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
|
| 481 |
+
[rank7]: return self._call_impl(*args, **kwargs)
|
| 482 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 483 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
|
| 484 |
+
[rank7]: return forward_call(*args, **kwargs)
|
| 485 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 486 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 108, in forward
|
| 487 |
+
[rank7]: hidden_states = self.attn_norm(hidden_states)
|
| 488 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/models/gated_deltanet/modeling_gated_deltanet.py", line 109, in torch_dynamo_resume_in_forward_at_108
|
| 489 |
+
[rank7]: hidden_states, attentions, past_key_values = self.attn(
|
| 490 |
+
[rank7]: ^^^^^^^^^^
|
| 491 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1751, in _wrapped_call_impl
|
| 492 |
+
[rank7]: return self._call_impl(*args, **kwargs)
|
| 493 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 494 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1762, in _call_impl
|
| 495 |
+
[rank7]: return forward_call(*args, **kwargs)
|
| 496 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 497 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py", line 108, in forward
|
| 498 |
+
[rank7]: q, k = self.rotary(q, k, seqlen_offset=seqlen_offset, max_seqlen=max_seqlen, cu_seqlens=cu_seqlens)
|
| 499 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/layers/nsa.py", line 123, in torch_dynamo_resume_in_forward_at_108
|
| 500 |
+
[rank7]: o = parallel_nsa(
|
| 501 |
+
[rank7]: ^^^^^^^^^^^^^
|
| 502 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 838, in parallel_nsa
|
| 503 |
+
[rank7]: o_cmp, lse_cmp = parallel_nsa_compression(
|
| 504 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 857, in torch_dynamo_resume_in_parallel_nsa_at_838
|
| 505 |
+
[rank7]: o = o_slc = ParallelNSAFunction.apply(q, k, v, block_indices, block_counts, block_size, scale, cu_seqlens)
|
| 506 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1432, in __call__
|
| 507 |
+
[rank7]: return self._torchdynamo_orig_callable(
|
| 508 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 509 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1213, in __call__
|
| 510 |
+
[rank7]: result = self._inner_convert(
|
| 511 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^
|
| 512 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 598, in __call__
|
| 513 |
+
[rank7]: return _compile(
|
| 514 |
+
[rank7]: ^^^^^^^^^
|
| 515 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1110, in _compile
|
| 516 |
+
[rank7]: raise InternalTorchDynamoError(
|
| 517 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 1059, in _compile
|
| 518 |
+
[rank7]: guarded_code = compile_inner(code, one_graph, hooks, transform)
|
| 519 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 520 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_utils_internal.py", line 97, in wrapper_function
|
| 521 |
+
[rank7]: return function(*args, **kwargs)
|
| 522 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 523 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 761, in compile_inner
|
| 524 |
+
[rank7]: return _compile_inner(code, one_graph, hooks, transform)
|
| 525 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 526 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 797, in _compile_inner
|
| 527 |
+
[rank7]: out_code = transform_code_object(code, transform)
|
| 528 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 529 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/bytecode_transformation.py", line 1422, in transform_code_object
|
| 530 |
+
[rank7]: transformations(instructions, code_options)
|
| 531 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 257, in _fn
|
| 532 |
+
[rank7]: return fn(*args, **kwargs)
|
| 533 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^
|
| 534 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 715, in transform
|
| 535 |
+
[rank7]: tracer.run()
|
| 536 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 3498, in run
|
| 537 |
+
[rank7]: super().run()
|
| 538 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 1337, in run
|
| 539 |
+
[rank7]: while self.step():
|
| 540 |
+
[rank7]: ^^^^^^^^^^^
|
| 541 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 1246, in step
|
| 542 |
+
[rank7]: self.dispatch_table[inst.opcode](self, inst)
|
| 543 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/symbolic_convert.py", line 2157, in COMPARE_OP
|
| 544 |
+
[rank7]: self.push(compare_op_handlers[inst.argval](self, self.popn(2), {}))
|
| 545 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 546 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 1111, in call_function
|
| 547 |
+
[rank7]: return handler(tx, args, kwargs)
|
| 548 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 549 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 789, in <lambda>
|
| 550 |
+
[rank7]: return lambda tx, args, kwargs: obj.call_function(
|
| 551 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^
|
| 552 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 1111, in call_function
|
| 553 |
+
[rank7]: return handler(tx, args, kwargs)
|
| 554 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^
|
| 555 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 945, in builtin_dispatch
|
| 556 |
+
[rank7]: rv = fn(tx, args, kwargs)
|
| 557 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^
|
| 558 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 839, in call_binop_handlers
|
| 559 |
+
[rank7]: rv = fn(tx, *args)
|
| 560 |
+
[rank7]: ^^^^^^^^^^^^^
|
| 561 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flame/.venv/lib/python3.11/site-packages/torch/_dynamo/variables/builtin.py", line 533, in compare_by_value
|
| 562 |
+
[rank7]: return ConstantVariable(op(a.value, b.value))
|
| 563 |
+
[rank7]: ^^^^^^^^^^^^^^^^^^^^
|
| 564 |
+
[rank7]: torch._dynamo.exc.InternalTorchDynamoError: TypeError: '>' not supported between instances of 'NoneType' and 'int'
|
| 565 |
+
|
| 566 |
+
[rank7]: from user code:
|
| 567 |
+
[rank7]: File "/mnt/nanjingcephfs/project_wx-rec-alg-bdc-exp/bwzheng/yulan/hyw/dataset/Selection/pretrain-linear-moe/flash-linear-attention/fla/ops/nsa/parallel.py", line 862, in torch_dynamo_resume_in_parallel_nsa_at_857
|
| 568 |
+
[rank7]: if window_size > 0:
|
| 569 |
+
|
| 570 |
+
[rank7]: Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
|
| 571 |
+
|
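This traceback is the same failure recorded in the earlier error.json: the model config sets `"window_size": null`, so `window_size` is `None` when `fla/ops/nsa/parallel.py` reaches `if window_size > 0:`, and Dynamo surfaces the resulting TypeError as an InternalTorchDynamoError. Below is a minimal None-safe guard illustrating the failure mode and one obvious fix (a sketch only, not necessarily the patch the fla repository adopted):

```python
# The comparison that crashes above, and a None-safe variant.
def use_sliding_window(window_size):
    # window_size is None when the config sets "window_size": null;
    # `None > 0` raises TypeError, so treat a missing window as disabled.
    return window_size is not None and window_size > 0

assert use_sliding_window(None) is False
assert use_sliding_window(64) is True
```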
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_3w7_34bf/attempt_0/7/stdout.log
ADDED
|
File without changes
|
bf16-gdn_6_nsa_1_340M.json-ctx8192-steps95366-lr3e-4-decay_typelinear-decay_ratio1-bs8-nn1-gas2/logs/none_ij1w4wht/attempt_0/0/stderr.log
ADDED
|
@@ -0,0 +1,3 @@
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:24ee09962378f99896314a1b7b782822c8cbdf18ec5a1b15bfae2560104cfa85
|
| 3 |
+
size 28822522
|