From bdda551f0ff9d72d26d75a069687d7532d658227 Mon Sep 17 00:00:00 2001 From: Artyom Beilis Date: Wed, 4 Sep 2024 23:40:39 +0300 Subject: [PATCH] Updated benchmarks for latest version --- benchmark.md | 197 +++++------------- benchmarks/v0.2.0/summary.md | 54 +++++ .../v0.2.0/test-A380-opencl-batch32.txt | 14 ++ benchmarks/v0.2.0/test-A380-xpu-batch32.txt | 14 ++ .../v0.2.0/test-gtx960-cuda-batch32.txt | 14 ++ .../v0.2.0/test-gtx960-opencl-batch32.txt | 14 ++ .../v0.2.0/test-rx6600-opencl-batch64.txt | 14 ++ .../v0.2.0/test-rx6600-rocm-batch64.txt | 14 ++ benchmarks/v0.2.0/train-A380-opencl-small.txt | 14 ++ benchmarks/v0.2.0/train-A380-xpu-small.txt | 14 ++ benchmarks/v0.2.0/train-gtx960-cuda-small.txt | 14 ++ .../v0.2.0/train-gtx960-opencl-small.txt | 14 ++ .../v0.2.0/train-rx6600-opencl-large.txt | 14 ++ benchmarks/v0.2.0/train-rx6600-rocm-large.txt | 14 ++ version.txt | 2 +- 15 files changed, 277 insertions(+), 144 deletions(-) create mode 100644 benchmarks/v0.2.0/summary.md create mode 100644 benchmarks/v0.2.0/test-A380-opencl-batch32.txt create mode 100644 benchmarks/v0.2.0/test-A380-xpu-batch32.txt create mode 100644 benchmarks/v0.2.0/test-gtx960-cuda-batch32.txt create mode 100644 benchmarks/v0.2.0/test-gtx960-opencl-batch32.txt create mode 100644 benchmarks/v0.2.0/test-rx6600-opencl-batch64.txt create mode 100644 benchmarks/v0.2.0/test-rx6600-rocm-batch64.txt create mode 100644 benchmarks/v0.2.0/train-A380-opencl-small.txt create mode 100644 benchmarks/v0.2.0/train-A380-xpu-small.txt create mode 100644 benchmarks/v0.2.0/train-gtx960-cuda-small.txt create mode 100644 benchmarks/v0.2.0/train-gtx960-opencl-small.txt create mode 100644 benchmarks/v0.2.0/train-rx6600-opencl-large.txt create mode 100644 benchmarks/v0.2.0/train-rx6600-rocm-large.txt diff --git a/benchmark.md b/benchmark.md index ea8733e..5791ae7 100644 --- a/benchmark.md +++ b/benchmark.md @@ -1,143 +1,54 @@ -# Benchmarks - -Below benchmarks done for comparison of rx6600xt and gtx960 - GPUs -of 
cuda and rocm backends vs `pytorch_ocl` - -Depending on the network training performance is around 60 to 90 percent -inference performance is somewhat better. - -Notes: time in ms per batch - smaller is better, input is standard imagenet -input Batchx3x224x224 - - -## Training - - - rx6600xt/8gb batch size rocm/hip opencl Raito % - alexnet 64 57.848 82.381 70.2 - resnet18 64 146.917 238.889 61.5 - resnet50 32 266.441 357.985 74.4 - convnext_small 16 337.252 583.794 57.8 - vgg16 16 206.312 348.692 59.2 - densenet161 16 296.807 485.035 61.2 - mobilenet_v2 32 157.476 197.886 79.6 - mobilenet_v3_small 64 92.506 120.406 76.8 - mobilenet_v3_large 64 286.795 319.938 89.6 - resnext50_32x4d 32 336.464 491.112 68.5 - wide_resnet50_2 32 466.841 642.973 72.6 - mnasnet1_0 32 159.97 167.306 95.6 - efficientnet_b0 32 205.69 305.157 67.4 - regnet_y_400mf 64 171.691 244.587 70.2 - - Average 71.8 - - gtx960/4gb batch size c cuda opencl Raito % - alexnet 64 128.142 270.006 47.5 - resnet18 64 415.589 746.578 55.7 - resnet50 16 373.932 599.182 62.4 - convnext_small 8 1128.995 1175.585 96.0 - vgg16 8 364.176 561.695 64.8 - densenet161 8 463.427 728.693 63.6 - mobilenet_v2 16 173.13 352.728 49.1 - mobilenet_v3_small 32 101.621 206.353 49.2 - mobilenet_v3_large 32 263.055 523.575 50.2 - resnext50_32x4d 16 539.007 846.71 63.7 - wide_resnet50_2 16 677.57 1040.154 65.1 - mnasnet1_0 16 167.542 322.004 52.0 - efficientnet_b0 16 241.023 540.09 44.6 - regnet_y_400mf 32 353.889 391.025 90.5 - - Average 61.0 - -## Inference - -Note, since my AMD and Nvidia gpus have different memory size differnet -batch sizes were used - - - rx6600xt/8gb rocm/hip opencl Ratio % Batch=64 - convnext_small 476.549 600.921 79.3 - alexnet 24.587 26.311 93.4 - resnet18 41.375 59.375 69.7 - resnet50 165.261 194.512 85.0 - vgg16 205.124 309.937 66.2 - densenet161 409.38 414.496 98.8 - inception_v3 90.635 131.685 68.8 - mobilenet_v2 77.691 93.701 82.9 - mobilenet_v3_small 22.203 26.151 84.9 - mobilenet_v3_large 63.229 
70.458 89.7 - resnext50_32x4d 244.676 274.791 89.0 - wide_resnet50_2 320.313 402.687 79.5 - mnasnet1_0 74.141 75.162 98.6 - efficientnet_b0 104.396 114.898 90.9 - efficientnet_b4 303.468 276.226 109.9 - regnet_y_400mf 43.298 57.491 75.3 - - Average 85.1 - - gtx960/4gb cuda opencl Ratio % Batch=32 - convnext_small 751.713 1206.871 62.3 - alexnet 29.446 44.27 66.5 - resnet18 66.053 93.352 70.8 - resnet50 214.787 316.754 67.8 - vgg16 350.278 486.743 72.0 - densenet161 511.183 587.856 87.0 - inception_v3 167.233 217.664 76.8 - mobilenet_v2 86.572 161.797 53.5 - mobilenet_v3_small 27.748 49.359 56.2 - mobilenet_v3_large 68.79 121.644 56.6 - resnext50_32x4d 284.697 440.466 64.6 - wide_resnet50_2 376.114 587.801 64.0 - mnasnet1_0 82.576 132.463 62.3 - efficientnet_b0 111.154 202.593 54.9 - efficientnet_b4 299.779 499.841 60.0 - regnet_y_400mf 99.336 95.446 104.1 - - Average 67.5 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - ppppppppppppppppppp +# Setup + +Tested 3 setups, pytorch 2.4 + +1. AMD rx6600XT, OpenCL drivers vs official ROCM pytorch (6.1) +2. NVidia gtx960, OpenCL drivers vs official CUDA 12.2 +3. Intel Arc A380, OpenCL NEO driver vs XPU - intel extension for pytorch (2.1, since that is what was released) + +Input is standard ImageNet batchx3x224x224, time in milliseconds, lower is better.
+ +# Training + + + +|AMD||||||Nvidia||||||Intel||||| +|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-| +|rx6600xt|batch|OpenCL|ROCM|% Perf||gtx960|batch|OpenCL|CUDA|% Perf||A380|batch|OpenCL|XPU|% Perf| +|alexnet|64|75.239|57.957|77||alexnet|64|257.09|130.561|51||alexnet|64|482.139|133.512|28| +|resnet18|64|238.927|147.099|62||resnet18|64|695.096|419.69|60||resnet18|64|1044.985|397.738|38| +|resnet50|32|358.872|266.155|74||resnet50|16|591.143|375.644|64||resnet50|16|640.916|329.849|51| +|convnext_small|16|608.297|337.736|56||convnext_small|8|1001.294|1120.676|112||convnext_small|8|841.302|259.292|31| +|vgg16|16|343.962|206.243|60||vgg16|8|520.75|363.288|70||vgg16|8|780.692|479.314|61| +|densenet161|16|494.175|297.001|60||densenet161|8|698.842|464.051|66||densenet161|8|834.207|423.883|51| +|mobilenet_v2|32|206.255|157.743|76||mobilenet_v2|16|335.279|173.748|52||mobilenet_v2|16|405.541|153.694|38| +|mobilenet_v3_small|64|130.571|92.83|71||mobilenet_v3_small|32|196.173|102.561|52||mobilenet_v3_small|32|275.302|92.086|33| +|mobilenet_v3_large|64|330.269|287.3|87||mobilenet_v3_large|32|497.168|264.072|53||mobilenet_v3_large|32|642.568|226.292|35| +|resnext50_32x4d|32|490.971|336.183|68||resnext50_32x4d|16|807.178|539.026|67||resnext50_32x4d|16|1068.918|396.39|37| +|wide_resnet50_2|32|643.083|468.04|73||wide_resnet50_2|16|1023.105|677.723|66||wide_resnet50_2|16|1373.346|634.213|46| +|mnasnet1_0|32|167.934|160.254|95||mnasnet1_0|16|302.854|167.911|55||mnasnet1_0|16|383.069|126.56|33| +|efficientnet_b0|32|313.972|205.674|66||efficientnet_b0|16|515.058|241.311|47||efficientnet_b0|16|531.724|203.157|38| +|regnet_y_400mf|64|246.069|171.841|70||regnet_y_400mf|32|361.507|353.584|98||regnet_y_400mf|32|635.279|224.228|35| +|Average||||71||Average||||65||Average||||40| + +# Inference + + +|AMD||||||Nvidia||||||Intel||||| +|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-| +|rx6600xt|batch|OpenCL|ROCM|% Perf||gtx960|batch|OpenCL|CUDA|% Perf||A380|batch|OpenCL|XPU|% Perf| 
+|alexnet|64|24.543|24.642|100||alexnet|32|45.007|30.271|67||alexnet|32|55.5|25.835|47| +|resnet18|64|59.428|41.569|70||resnet18|32|94.044|66.61|71||resnet18|32|113.002|55.647|49| +|resnet50|64|196.75|165.706|84||resnet50|32|316.899|215.245|68||resnet50|32|271.778|145.842|54| +|convnext_small|64|632.215|478.088|76||convnext_small|32|881.586|751.286|85||convnext_small|32|670.291|294.405|44| +|vgg16|64|310.767|205.745|66||vgg16|32|490.68|351.488|72||vgg16|32|801.684|333.954|42| +|densenet161|64|415.707|410.906|99||densenet161|32|589.712|510.883|87||densenet161|32|685.154|315.407|46| +|mobilenet_v2|64|93.699|77.774|83||mobilenet_v2|32|162.4|87.376|54||mobilenet_v2|32|100.363|51.589|51| +|mobilenet_v3_small|64|25.653|22.253|87||mobilenet_v3_small|32|50.097|28.739|57||mobilenet_v3_small|32|36.92|26.508|72| +|mobilenet_v3_large|64|70.409|63.28|90||mobilenet_v3_large|32|122.416|69.432|57||mobilenet_v3_large|32|84.413|52.328|62| +|resnext50_32x4d|64|274.967|245.411|89||resnext50_32x4d|32|440.411|284.571|65||resnext50_32x4d|32|359.037|169.194|47| +|wide_resnet50_2|64|404.214|321.398|80||wide_resnet50_2|32|589.164|376.938|64||wide_resnet50_2|32|682.184|321.014|47| +|mnasnet1_0|64|75.027|74.211|99||mnasnet1_0|32|133.324|83.407|63||mnasnet1_0|32|91.441|51.785|57| +|efficientnet_b0|64|114.735|104.417|91||efficientnet_b0|32|203.531|111.822|55||efficientnet_b0|32|129.755|88.131|68| +|regnet_y_400mf|64|57.408|43.313|75||regnet_y_400mf|32|96.079|99.022|103||regnet_y_400mf|32|87.756|56.503|64| +|Average||||85||Average||||69||Average||||54| diff --git a/benchmarks/v0.2.0/summary.md b/benchmarks/v0.2.0/summary.md new file mode 100644 index 0000000..5791ae7 --- /dev/null +++ b/benchmarks/v0.2.0/summary.md @@ -0,0 +1,54 @@ +# Setup + +Tested 3 setups, pytorch 2.4 + +1. AMD rx6600XT, OpenCL drivers vs official ROCM pytorch (6.1) +2. NVidia gtx960, OpenCL drivers vs official CUDA 12.2 +3.
Intel Arc A380, OpenCL NEO driver vs XPU - intel extension for pytorch (2.1, since that is what was released) + +Input is standard ImageNet batchx3x224x224, time in milliseconds, lower is better. + +# Training + + + +|AMD||||||Nvidia||||||Intel||||| +|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-| +|rx6600xt|batch|OpenCL|ROCM|% Perf||gtx960|batch|OpenCL|CUDA|% Perf||A380|batch|OpenCL|XPU|% Perf| +|alexnet|64|75.239|57.957|77||alexnet|64|257.09|130.561|51||alexnet|64|482.139|133.512|28| +|resnet18|64|238.927|147.099|62||resnet18|64|695.096|419.69|60||resnet18|64|1044.985|397.738|38| +|resnet50|32|358.872|266.155|74||resnet50|16|591.143|375.644|64||resnet50|16|640.916|329.849|51| +|convnext_small|16|608.297|337.736|56||convnext_small|8|1001.294|1120.676|112||convnext_small|8|841.302|259.292|31| +|vgg16|16|343.962|206.243|60||vgg16|8|520.75|363.288|70||vgg16|8|780.692|479.314|61| +|densenet161|16|494.175|297.001|60||densenet161|8|698.842|464.051|66||densenet161|8|834.207|423.883|51| +|mobilenet_v2|32|206.255|157.743|76||mobilenet_v2|16|335.279|173.748|52||mobilenet_v2|16|405.541|153.694|38| +|mobilenet_v3_small|64|130.571|92.83|71||mobilenet_v3_small|32|196.173|102.561|52||mobilenet_v3_small|32|275.302|92.086|33| +|mobilenet_v3_large|64|330.269|287.3|87||mobilenet_v3_large|32|497.168|264.072|53||mobilenet_v3_large|32|642.568|226.292|35| +|resnext50_32x4d|32|490.971|336.183|68||resnext50_32x4d|16|807.178|539.026|67||resnext50_32x4d|16|1068.918|396.39|37| +|wide_resnet50_2|32|643.083|468.04|73||wide_resnet50_2|16|1023.105|677.723|66||wide_resnet50_2|16|1373.346|634.213|46| +|mnasnet1_0|32|167.934|160.254|95||mnasnet1_0|16|302.854|167.911|55||mnasnet1_0|16|383.069|126.56|33| +|efficientnet_b0|32|313.972|205.674|66||efficientnet_b0|16|515.058|241.311|47||efficientnet_b0|16|531.724|203.157|38| +|regnet_y_400mf|64|246.069|171.841|70||regnet_y_400mf|32|361.507|353.584|98||regnet_y_400mf|32|635.279|224.228|35| +|Average||||71||Average||||65||Average||||40| + +# Inference + +
+|AMD||||||Nvidia||||||Intel||||| +|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-|-| +|rx6600xt|batch|OpenCL|ROCM|% Perf||gtx960|batch|OpenCL|CUDA|% Perf||A380|batch|OpenCL|XPU|% Perf| +|alexnet|64|24.543|24.642|100||alexnet|32|45.007|30.271|67||alexnet|32|55.5|25.835|47| +|resnet18|64|59.428|41.569|70||resnet18|32|94.044|66.61|71||resnet18|32|113.002|55.647|49| +|resnet50|64|196.75|165.706|84||resnet50|32|316.899|215.245|68||resnet50|32|271.778|145.842|54| +|convnext_small|64|632.215|478.088|76||convnext_small|32|881.586|751.286|85||convnext_small|32|670.291|294.405|44| +|vgg16|64|310.767|205.745|66||vgg16|32|490.68|351.488|72||vgg16|32|801.684|333.954|42| +|densenet161|64|415.707|410.906|99||densenet161|32|589.712|510.883|87||densenet161|32|685.154|315.407|46| +|mobilenet_v2|64|93.699|77.774|83||mobilenet_v2|32|162.4|87.376|54||mobilenet_v2|32|100.363|51.589|51| +|mobilenet_v3_small|64|25.653|22.253|87||mobilenet_v3_small|32|50.097|28.739|57||mobilenet_v3_small|32|36.92|26.508|72| +|mobilenet_v3_large|64|70.409|63.28|90||mobilenet_v3_large|32|122.416|69.432|57||mobilenet_v3_large|32|84.413|52.328|62| +|resnext50_32x4d|64|274.967|245.411|89||resnext50_32x4d|32|440.411|284.571|65||resnext50_32x4d|32|359.037|169.194|47| +|wide_resnet50_2|64|404.214|321.398|80||wide_resnet50_2|32|589.164|376.938|64||wide_resnet50_2|32|682.184|321.014|47| +|mnasnet1_0|64|75.027|74.211|99||mnasnet1_0|32|133.324|83.407|63||mnasnet1_0|32|91.441|51.785|57| +|efficientnet_b0|64|114.735|104.417|91||efficientnet_b0|32|203.531|111.822|55||efficientnet_b0|32|129.755|88.131|68| +|regnet_y_400mf|64|57.408|43.313|75||regnet_y_400mf|32|96.079|99.022|103||regnet_y_400mf|32|87.756|56.503|64| +|Average||||85||Average||||69||Average||||54| diff --git a/benchmarks/v0.2.0/test-A380-opencl-batch32.txt b/benchmarks/v0.2.0/test-A380-opencl-batch32.txt new file mode 100644 index 0000000..daa8d82 --- /dev/null +++ b/benchmarks/v0.2.0/test-A380-opencl-batch32.txt @@ -0,0 +1,14 @@ +alexnet 55.500 +resnet18 113.002 +resnet50 
271.778 +convnext_small 670.291 +vgg16 801.684 +densenet161 685.154 +mobilenet_v2 100.363 +mobilenet_v3_small 36.920 +mobilenet_v3_large 84.413 +resnext50_32x4d 359.037 +wide_resnet50_2 682.184 +mnasnet1_0 91.441 +efficientnet_b0 129.755 +regnet_y_400mf 87.756 diff --git a/benchmarks/v0.2.0/test-A380-xpu-batch32.txt b/benchmarks/v0.2.0/test-A380-xpu-batch32.txt new file mode 100644 index 0000000..1d7bf78 --- /dev/null +++ b/benchmarks/v0.2.0/test-A380-xpu-batch32.txt @@ -0,0 +1,14 @@ +alexnet 25.835 +resnet18 55.647 +resnet50 145.842 +convnext_small 294.405 +vgg16 333.954 +densenet161 315.407 +mobilenet_v2 51.589 +mobilenet_v3_small 26.508 +mobilenet_v3_large 52.328 +resnext50_32x4d 169.194 +wide_resnet50_2 321.014 +mnasnet1_0 51.785 +efficientnet_b0 88.131 +regnet_y_400mf 56.503 diff --git a/benchmarks/v0.2.0/test-gtx960-cuda-batch32.txt b/benchmarks/v0.2.0/test-gtx960-cuda-batch32.txt new file mode 100644 index 0000000..a837f05 --- /dev/null +++ b/benchmarks/v0.2.0/test-gtx960-cuda-batch32.txt @@ -0,0 +1,14 @@ +alexnet 30.271 +resnet18 66.610 +resnet50 215.245 +convnext_small 751.286 +vgg16 351.488 +densenet161 510.883 +mobilenet_v2 87.376 +mobilenet_v3_small 28.739 +mobilenet_v3_large 69.432 +resnext50_32x4d 284.571 +wide_resnet50_2 376.938 +mnasnet1_0 83.407 +efficientnet_b0 111.822 +regnet_y_400mf 99.022 diff --git a/benchmarks/v0.2.0/test-gtx960-opencl-batch32.txt b/benchmarks/v0.2.0/test-gtx960-opencl-batch32.txt new file mode 100644 index 0000000..f82bd0d --- /dev/null +++ b/benchmarks/v0.2.0/test-gtx960-opencl-batch32.txt @@ -0,0 +1,14 @@ +alexnet 45.007 +resnet18 94.044 +resnet50 316.899 +convnext_small 881.586 +vgg16 490.680 +densenet161 589.712 +mobilenet_v2 162.400 +mobilenet_v3_small 50.097 +mobilenet_v3_large 122.416 +resnext50_32x4d 440.411 +wide_resnet50_2 589.164 +mnasnet1_0 133.324 +efficientnet_b0 203.531 +regnet_y_400mf 96.079 diff --git a/benchmarks/v0.2.0/test-rx6600-opencl-batch64.txt b/benchmarks/v0.2.0/test-rx6600-opencl-batch64.txt new 
file mode 100644 index 0000000..dbcaf43 --- /dev/null +++ b/benchmarks/v0.2.0/test-rx6600-opencl-batch64.txt @@ -0,0 +1,14 @@ +alexnet 24.543 +resnet18 59.428 +resnet50 196.750 +convnext_small 632.215 +vgg16 310.767 +densenet161 415.707 +mobilenet_v2 93.699 +mobilenet_v3_small 25.653 +mobilenet_v3_large 70.409 +resnext50_32x4d 274.967 +wide_resnet50_2 404.214 +mnasnet1_0 75.027 +efficientnet_b0 114.735 +regnet_y_400mf 57.408 diff --git a/benchmarks/v0.2.0/test-rx6600-rocm-batch64.txt b/benchmarks/v0.2.0/test-rx6600-rocm-batch64.txt new file mode 100644 index 0000000..975924e --- /dev/null +++ b/benchmarks/v0.2.0/test-rx6600-rocm-batch64.txt @@ -0,0 +1,14 @@ +alexnet 24.642 +resnet18 41.569 +resnet50 165.706 +convnext_small 478.088 +vgg16 205.745 +densenet161 410.906 +mobilenet_v2 77.774 +mobilenet_v3_small 22.253 +mobilenet_v3_large 63.280 +resnext50_32x4d 245.411 +wide_resnet50_2 321.398 +mnasnet1_0 74.211 +efficientnet_b0 104.417 +regnet_y_400mf 43.313 diff --git a/benchmarks/v0.2.0/train-A380-opencl-small.txt b/benchmarks/v0.2.0/train-A380-opencl-small.txt new file mode 100644 index 0000000..9ff2b51 --- /dev/null +++ b/benchmarks/v0.2.0/train-A380-opencl-small.txt @@ -0,0 +1,14 @@ + alexnet 64 482.139 + resnet18 64 1044.985 + resnet50 16 640.916 + convnext_small 8 841.302 + vgg16 8 780.692 + densenet161 8 834.207 + mobilenet_v2 16 405.541 + mobilenet_v3_small 32 275.302 + mobilenet_v3_large 32 642.568 + resnext50_32x4d 16 1068.918 + wide_resnet50_2 16 1373.346 + mnasnet1_0 16 383.069 + efficientnet_b0 16 531.724 + regnet_y_400mf 32 635.279 diff --git a/benchmarks/v0.2.0/train-A380-xpu-small.txt b/benchmarks/v0.2.0/train-A380-xpu-small.txt new file mode 100644 index 0000000..baa7d1a --- /dev/null +++ b/benchmarks/v0.2.0/train-A380-xpu-small.txt @@ -0,0 +1,14 @@ + alexnet 64 133.512 + resnet18 64 397.738 + resnet50 16 329.849 + convnext_small 8 259.292 + vgg16 8 479.314 + densenet161 8 423.883 + mobilenet_v2 16 153.694 + mobilenet_v3_small 32 92.086 + 
mobilenet_v3_large 32 226.292 + resnext50_32x4d 16 396.390 + wide_resnet50_2 16 634.213 + mnasnet1_0 16 126.560 + efficientnet_b0 16 203.157 + regnet_y_400mf 32 224.228 diff --git a/benchmarks/v0.2.0/train-gtx960-cuda-small.txt b/benchmarks/v0.2.0/train-gtx960-cuda-small.txt new file mode 100644 index 0000000..7575e75 --- /dev/null +++ b/benchmarks/v0.2.0/train-gtx960-cuda-small.txt @@ -0,0 +1,14 @@ + alexnet 64 130.561 + resnet18 64 419.690 + resnet50 16 375.644 + convnext_small 8 1120.676 + vgg16 8 363.288 + densenet161 8 464.051 + mobilenet_v2 16 173.748 + mobilenet_v3_small 32 102.561 + mobilenet_v3_large 32 264.072 + resnext50_32x4d 16 539.026 + wide_resnet50_2 16 677.723 + mnasnet1_0 16 167.911 + efficientnet_b0 16 241.311 + regnet_y_400mf 32 353.584 diff --git a/benchmarks/v0.2.0/train-gtx960-opencl-small.txt b/benchmarks/v0.2.0/train-gtx960-opencl-small.txt new file mode 100644 index 0000000..9936635 --- /dev/null +++ b/benchmarks/v0.2.0/train-gtx960-opencl-small.txt @@ -0,0 +1,14 @@ + alexnet 64 257.090 + resnet18 64 695.096 + resnet50 16 591.143 + convnext_small 8 1001.294 + vgg16 8 520.750 + densenet161 8 698.842 + mobilenet_v2 16 335.279 + mobilenet_v3_small 32 196.173 + mobilenet_v3_large 32 497.168 + resnext50_32x4d 16 807.178 + wide_resnet50_2 16 1023.105 + mnasnet1_0 16 302.854 + efficientnet_b0 16 515.058 + regnet_y_400mf 32 361.507 diff --git a/benchmarks/v0.2.0/train-rx6600-opencl-large.txt b/benchmarks/v0.2.0/train-rx6600-opencl-large.txt new file mode 100644 index 0000000..381a102 --- /dev/null +++ b/benchmarks/v0.2.0/train-rx6600-opencl-large.txt @@ -0,0 +1,14 @@ + alexnet 64 75.239 + resnet18 64 238.927 + resnet50 32 358.872 + convnext_small 16 608.297 + vgg16 16 343.962 + densenet161 16 494.175 + mobilenet_v2 32 206.255 + mobilenet_v3_small 64 130.571 + mobilenet_v3_large 64 330.269 + resnext50_32x4d 32 490.971 + wide_resnet50_2 32 643.083 + mnasnet1_0 32 167.934 + efficientnet_b0 32 313.972 + regnet_y_400mf 64 246.069 diff --git 
a/benchmarks/v0.2.0/train-rx6600-rocm-large.txt b/benchmarks/v0.2.0/train-rx6600-rocm-large.txt new file mode 100644 index 0000000..8e03186 --- /dev/null +++ b/benchmarks/v0.2.0/train-rx6600-rocm-large.txt @@ -0,0 +1,14 @@ + alexnet 64 57.957 + resnet18 64 147.099 + resnet50 32 266.155 + convnext_small 16 337.736 + vgg16 16 206.243 + densenet161 16 297.001 + mobilenet_v2 32 157.743 + mobilenet_v3_small 64 92.830 + mobilenet_v3_large 64 287.300 + resnext50_32x4d 32 336.183 + wide_resnet50_2 32 468.040 + mnasnet1_0 32 160.254 + efficientnet_b0 32 205.674 + regnet_y_400mf 64 171.841 diff --git a/version.txt b/version.txt index 6e8bf73..0ea3a94 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.1.0 +0.2.0