From 902b0373a45b533795b96a5648169b12d0001e5f Mon Sep 17 00:00:00 2001
From: noctis <970308389@qq.com>
Date: Wed, 25 Feb 2026 14:36:52 +0800
Subject: [PATCH] =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E7=BB=93=E6=9E=84?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore   |  3 ++-
 README_CN.md | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 benchmark.py |  2 +-
 3 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index e009872..7f5cc1f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 data/*
-Qwen3-VL-2B-Instruct/*
\ No newline at end of file
+Qwen3-VL-2B-Instruct/*
+__pycache__
\ No newline at end of file
diff --git a/README_CN.md b/README_CN.md
index 6324477..4df1c6d 100755
--- a/README_CN.md
+++ b/README_CN.md
@@ -346,3 +346,69 @@ python benchmark.py \
 
 
 
+Qwen3VLForConditionalGeneration(
+  (model): Qwen3VLModel(
+    (visual): Qwen3VLVisionModel(
+      (patch_embed): Qwen3VLVisionPatchEmbed(
+        (proj): Conv3d(3, 1024, kernel_size=(2, 16, 16), stride=(2, 16, 16))
+      )
+      (pos_embed): Embedding(2304, 1024)
+      (rotary_pos_emb): Qwen3VLVisionRotaryEmbedding()
+      (blocks): ModuleList(
+        (0-23): 24 x Qwen3VLVisionBlock(
+          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
+          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
+          (attn): Qwen3VLVisionAttention(
+            (qkv): Linear(in_features=1024, out_features=3072, bias=True)
+            (proj): Linear(in_features=1024, out_features=1024, bias=True)
+          )
+          (mlp): Qwen3VLVisionMLP(
+            (linear_fc1): Linear(in_features=1024, out_features=4096, bias=True)
+            (linear_fc2): Linear(in_features=4096, out_features=1024, bias=True)
+            (act_fn): GELUTanh()
+          )
+        )
+      )
+      (merger): Qwen3VLVisionPatchMerger(
+        (norm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
+        (linear_fc1): Linear(in_features=4096, out_features=4096, bias=True)
+        (act_fn): GELU(approximate='none')
+        (linear_fc2): Linear(in_features=4096, out_features=2048, bias=True)
+      )
+      (deepstack_merger_list): ModuleList(
+        (0-2): 3 x Qwen3VLVisionPatchMerger(
+          (norm): LayerNorm((4096,), eps=1e-06, elementwise_affine=True)
+          (linear_fc1): Linear(in_features=4096, out_features=4096, bias=True)
+          (act_fn): GELU(approximate='none')
+          (linear_fc2): Linear(in_features=4096, out_features=2048, bias=True)
+        )
+      )
+    )
+    (language_model): Qwen3VLTextModel(
+      (embed_tokens): Embedding(151936, 2048)
+      (layers): ModuleList(
+        (0-27): 28 x Qwen3VLTextDecoderLayer(
+          (self_attn): Qwen3VLTextAttention(
+            (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
+            (k_proj): Linear(in_features=2048, out_features=1024, bias=False)
+            (v_proj): Linear(in_features=2048, out_features=1024, bias=False)
+            (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
+            (q_norm): Qwen3VLTextRMSNorm((128,), eps=1e-06)
+            (k_norm): Qwen3VLTextRMSNorm((128,), eps=1e-06)
+          )
+          (mlp): Qwen3VLTextMLP(
+            (gate_proj): Linear(in_features=2048, out_features=6144, bias=False)
+            (up_proj): Linear(in_features=2048, out_features=6144, bias=False)
+            (down_proj): Linear(in_features=6144, out_features=2048, bias=False)
+            (act_fn): SiLUActivation()
+          )
+          (input_layernorm): Qwen3VLTextRMSNorm((2048,), eps=1e-06)
+          (post_attention_layernorm): Qwen3VLTextRMSNorm((2048,), eps=1e-06)
+        )
+      )
+      (norm): Qwen3VLTextRMSNorm((2048,), eps=1e-06)
+      (rotary_emb): Qwen3VLTextRotaryEmbedding()
+    )
+  )
+  (lm_head): Linear(in_features=2048, out_features=151936, bias=False)
+)
\ No newline at end of file
diff --git a/benchmark.py b/benchmark.py
index 8bc2e2f..4141767 100755
--- a/benchmark.py
+++ b/benchmark.py
@@ -588,7 +588,7 @@ def main():
     parser.add_argument("--model-path", type=str, default="./Qwen3-VL-2B-Instruct", help="Path to model weights")
     parser.add_argument("--dataset-path", type=str, default="./data", help="Path to validation dataset")
     parser.add_argument("--output", type=str, default="result.json", help="Output JSON file path")
-    parser.add_argument("--num-samples", type=int, default=None, help="Number of samples to evaluate (default: all)")
+    parser.add_argument("--num-samples", type=int, default=100, help="Number of samples to evaluate (default: 100)")
     parser.add_argument("--random-seed", type=int, default=None, help="Random seed for reproducibility")
     
     args = parser.parse_args()