diff --git a/aider/website/_data/o1_polyglot_leaderboard.yml b/aider/website/_data/o1_polyglot_leaderboard.yml index 9badd7a85cc..20e8102ad23 100644 --- a/aider/website/_data/o1_polyglot_leaderboard.yml +++ b/aider/website/_data/o1_polyglot_leaderboard.yml @@ -104,7 +104,7 @@ - dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff test_cases: 225 - model: deepseek-chat + model: DeepSeek Chat V2.5 edit_format: diff commit_hash: a755079-dirty pass_rate_1: 5.3 diff --git a/aider/website/_data/polyglot_leaderboard.yml b/aider/website/_data/polyglot_leaderboard.yml index 9badd7a85cc..b841a1f0c70 100644 --- a/aider/website/_data/polyglot_leaderboard.yml +++ b/aider/website/_data/polyglot_leaderboard.yml @@ -104,7 +104,7 @@ - dirname: 2024-12-21-20-56-21--polyglot-deepseek-diff test_cases: 225 - model: deepseek-chat + model: DeepSeek Chat V2.5 edit_format: diff commit_hash: a755079-dirty pass_rate_1: 5.3 @@ -256,4 +256,30 @@ date: 2024-12-22 versions: 0.69.2.dev seconds_per_case: 12.2 - total_cost: 0.0000 \ No newline at end of file + total_cost: 0.0000 + +- dirname: 2024-12-25-13-31-51--deepseekv3preview-diff2 + test_cases: 225 + model: DeepSeek Chat V3 Preview + edit_format: diff + commit_hash: 0a23c4a-dirty + pass_rate_1: 22.7 + pass_rate_2: 48.4 + pass_num_1: 51 + pass_num_2: 109 + percent_cases_well_formed: 98.7 + error_outputs: 7 + num_malformed_responses: 7 + num_with_malformed_responses: 3 + user_asks: 19 + lazy_comments: 0 + syntax_errors: 0 + indentation_errors: 0 + exhausted_context_windows: 0 + test_timeouts: 8 + total_tests: 225 + command: aider --model deepseek/deepseek-chat + date: 2024-12-25 + versions: 0.69.2.dev + seconds_per_case: 34.8 + total_cost: 0.3369 \ No newline at end of file diff --git a/aider/website/docs/leaderboards/index.md b/aider/website/docs/leaderboards/index.md index ce0b826ff4c..7b308ddd0ea 100644 --- a/aider/website/docs/leaderboards/index.md +++ b/aider/website/docs/leaderboards/index.md @@ -68,12 +68,15 @@ The model also has to successfully apply all its changes to the source file with +### Aider polyglot benchmark results +