From 902b0373a45b533795b96a5648169b12d0001e5f Mon Sep 17 00:00:00 2001 From: noctis <970308389@qq.com> Date: Wed, 25 Feb 2026 14:36:52 +0800 Subject: [PATCH] =?UTF-8?q?=E6=A8=A1=E5=9E=8B=E7=BB=93=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 3 ++- README_CN.md | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++ benchmark.py | 2 +- 3 files changed, 69 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index e009872..7f5cc1f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ data/* -Qwen3-VL-2B-Instruct/* \ No newline at end of file +Qwen3-VL-2B-Instruct/* +__pycache__ \ No newline at end of file diff --git a/README_CN.md b/README_CN.md index 6324477..4df1c6d 100755 --- a/README_CN.md +++ b/README_CN.md @@ -346,3 +346,69 @@ python benchmark.py \ +Qwen3VLForConditionalGeneration( + (model): Qwen3VLModel( + (visual): Qwen3VLVisionModel( + (patch_embed): Qwen3VLVisionPatchEmbed( + (proj): Conv3d(3, 1024, kernel_size=(2, 16, 16), stride=(2, 16, 16)) + ) + (pos_embed): Embedding(2304, 1024) + (rotary_pos_emb): Qwen3VLVisionRotaryEmbedding() + (blocks): ModuleList( + (0-23): 24 x Qwen3VLVisionBlock( + (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (attn): Qwen3VLVisionAttention( + (qkv): Linear(in_features=1024, out_features=3072, bias=True) + (proj): Linear(in_features=1024, out_features=1024, bias=True) + ) + (mlp): Qwen3VLVisionMLP( + (linear_fc1): Linear(in_features=1024, out_features=4096, bias=True) + (linear_fc2): Linear(in_features=4096, out_features=1024, bias=True) + (act_fn): GELUTanh() + ) + ) + ) + (merger): Qwen3VLVisionPatchMerger( + (norm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (linear_fc1): Linear(in_features=4096, out_features=4096, bias=True) + (act_fn): GELU(approximate='none') + (linear_fc2): Linear(in_features=4096, out_features=2048, bias=True) + ) + 
(deepstack_merger_list): ModuleList( + (0-2): 3 x Qwen3VLVisionPatchMerger( + (norm): LayerNorm((4096,), eps=1e-06, elementwise_affine=True) + (linear_fc1): Linear(in_features=4096, out_features=4096, bias=True) + (act_fn): GELU(approximate='none') + (linear_fc2): Linear(in_features=4096, out_features=2048, bias=True) + ) + ) + ) + (language_model): Qwen3VLTextModel( + (embed_tokens): Embedding(151936, 2048) + (layers): ModuleList( + (0-27): 28 x Qwen3VLTextDecoderLayer( + (self_attn): Qwen3VLTextAttention( + (q_proj): Linear(in_features=2048, out_features=2048, bias=False) + (k_proj): Linear(in_features=2048, out_features=1024, bias=False) + (v_proj): Linear(in_features=2048, out_features=1024, bias=False) + (o_proj): Linear(in_features=2048, out_features=2048, bias=False) + (q_norm): Qwen3VLTextRMSNorm((128,), eps=1e-06) + (k_norm): Qwen3VLTextRMSNorm((128,), eps=1e-06) + ) + (mlp): Qwen3VLTextMLP( + (gate_proj): Linear(in_features=2048, out_features=6144, bias=False) + (up_proj): Linear(in_features=2048, out_features=6144, bias=False) + (down_proj): Linear(in_features=6144, out_features=2048, bias=False) + (act_fn): SiLUActivation() + ) + (input_layernorm): Qwen3VLTextRMSNorm((2048,), eps=1e-06) + (post_attention_layernorm): Qwen3VLTextRMSNorm((2048,), eps=1e-06) + ) + ) + (norm): Qwen3VLTextRMSNorm((2048,), eps=1e-06) + (rotary_emb): Qwen3VLTextRotaryEmbedding() + ) + ) + (lm_head): Linear(in_features=2048, out_features=151936, bias=False) +) \ No newline at end of file diff --git a/benchmark.py b/benchmark.py index 8bc2e2f..4141767 100755 --- a/benchmark.py +++ b/benchmark.py @@ -588,7 +588,7 @@ def main(): parser.add_argument("--model-path", type=str, default="./Qwen3-VL-2B-Instruct", help="Path to model weights") parser.add_argument("--dataset-path", type=str, default="./data", help="Path to validation dataset") parser.add_argument("--output", type=str, default="result.json", help="Output JSON file path") - parser.add_argument("--num-samples", type=int, 
default=None, help="Number of samples to evaluate (default: all)") + parser.add_argument("--num-samples", type=int, default=100, help="Number of samples to evaluate (default: 100)") parser.add_argument("--random-seed", type=int, default=None, help="Random seed for reproducibility") args = parser.parse_args()