# gitlab-gpu-pipeline.yml

# Settings applied to every job in this pipeline unless a job overrides them.
default:
    # Container image the jobs run in. The image's own entrypoint is replaced
    # by a single empty string.
    image:
        entrypoint:
            - ""
        name: 'ravilmobile/seissol-base:gcc-8_cuda-10'
    # Only runners carrying this tag pick up the jobs.
    tags:
        - atsccs68-docker-executor

# Pipeline stages; they run in the order listed here.
stages:
    - pre_build         # gpu_pre_build
    - build             # gpu_build
    - test              # gpu_convergence_test
    - performance_eval  # gpu_performance_test

# Builds the third-party dependencies (Eigen, ImpalaJIT, easi) and installs
# them into ./self_usr, which is published as an artifact for later jobs.
gpu_pre_build:
    stage: pre_build
    allow_failure: false
    variables:
        GIT_STRATEGY: clone
    artifacts:
        paths:
            - ./self_usr

    before_script:
        - git branch -vva
        - echo $commit_author_name
    script:
        # Print the runner configuration into the job log.
        - 'echo "HOST arch.:" $HOST'
        - 'echo "GPU model:" $GPU_MODEL'
        - nvidia-smi
        # Common install prefix for everything built below.
        - mkdir ./self_usr
        # Eigen 3.4.0, downloaded as a release tarball.
        - wget https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz
        - tar -xf eigen-3.4.0.tar.gz
        - >-
            mkdir eigen-3.4.0/build && cd eigen-3.4.0/build &&
            cmake .. -DCMAKE_INSTALL_PREFIX=../../self_usr && make -j $(nproc) install
            && cd ../..
        # ImpalaJIT from its default branch, compiled with -fPIC.
        - >-
            git clone https://github.com/uphoffc/ImpalaJIT.git impalajit &&
            mkdir -p impalajit/build && cd impalajit/build &&
            cmake .. -DCMAKE_INSTALL_PREFIX=../../self_usr -DCMAKE_CXX_FLAGS="-fPIC" &&
            make -j $(nproc) install &&
            cd ../..
        # easi v1.0.0 with ImpalaJIT enabled and ASAGI disabled, compiled with -fPIC.
        - >-
            git clone --depth 1 --branch v1.0.0 https://github.com/SeisSol/easi.git easi &&
            mkdir -p easi/build && cd easi/build &&
            cmake .. -DASAGI=OFF -DIMPALAJIT=ON -DCMAKE_INSTALL_PREFIX=../../self_usr -DCMAKE_CXX_FLAGS="-fPIC" &&
            make -j $(nproc) install &&
            cd ../..

# gcc@9 is required to compile and run hipSYCL@0.9.1, which is based on LLVM and emits NVPTX code by itself.
# gcc@8 is required for cuda@10.2.
# Note: we currently cannot switch to cuda@11 because an old version of the NVIDIA driver is installed
#       on our GPU CI runner (atsccs68-docker-executor). Once the driver is updated, we will be able
#       to switch to cuda@11 and thus to gcc@9.
# Shell steps shared by all GPU jobs (pulled in through the `common_gpu_steps`
# anchor): select the host compilers for the chosen BACKEND and expose the
# dependencies installed under ./self_usr.
.common_gpu_test_script: &common_gpu_steps
    - export CTEST_OUTPUT_ON_FAILURE=1
    # hipsycl uses the gcc-9 toolchain plus the CUDA libraries; every other
    # backend uses gcc-8. `${VAR:+:${VAR}}` appends the previous value only when
    # it is non-empty, so no empty path element (which the compiler and the
    # dynamic loader interpret as the current directory) is created.
    - if [[ "${BACKEND}" == "hipsycl" ]]; then
        export CC=/usr/bin/gcc-9 ;
        export CXX=/usr/bin/g++-9 ;
        export FC=/usr/bin/gfortran-9 ;
        export LIBRARY_PATH=/usr/local/cuda/lib64${LIBRARY_PATH:+:${LIBRARY_PATH}} ;
        export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} ;
      else
        export CC=/usr/bin/gcc-8 ;
        export CXX=/usr/bin/g++-8 ;
      fi ;
    # The hip backend sources the ROCm profile script and targets NVIDIA.
    - if [[ "${BACKEND}" == "hip" ]]; then
        . /etc/profile.d/rocm.sh ;
        export HIP_PLATFORM=nvidia ;
      fi ;
    # Fails the job early when the ./self_usr artifact is missing.
    - ls -l ./self_usr/lib && ls -l ./self_usr/include
    # CMake gets the install prefix; the compiler search paths get the actual
    # lib/ and include/ subdirectories, since LIBRARY_PATH and CPATH are lists
    # of directories that are searched directly.
    - export CMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH:+${CMAKE_PREFIX_PATH}:}$PWD/self_usr ;
      export LIBRARY_PATH=${LIBRARY_PATH:+${LIBRARY_PATH}:}$PWD/self_usr/lib ;
      export CPATH=${CPATH:+${CPATH}:}$PWD/self_usr/include ;

# Configures (CMake, Release) and compiles the project once per BACKEND matrix
# value, in double and single precision. The build_* directories are published
# as artifacts.
gpu_build:
    stage: build
    allow_failure: false
    needs:
        - job: gpu_pre_build
    parallel:
        matrix:
        - BACKEND: [cuda, hipsycl, hip]
    before_script:
        # Replace the relative "../.." prefix in .gitmodules with
        # https://github.com before the submodules are fetched.
        - git submodule init
        - sed -i 's/\.\.\/\.\./https\:\/\/github\.com/g' .gitmodules
        - git submodule sync
        - git submodule --quiet update --init --recursive
    script:
        - *common_gpu_steps
        - pip3 install git+https://github.com/ravil-mobile/gemmforge.git@v0.0.206
        - set -euo pipefail
        # mkdir and cd are deliberately separate commands: under `set -e` a
        # failure on the left-hand side of `&&` does not abort the script, so
        # `mkdir ... && cd ...` could fall through and run cmake in the
        # repository root.
        - for precision in double single; do
            mkdir -p ./build_${BACKEND}_${precision} ;
            cd ./build_${BACKEND}_${precision} ;
            cmake ..
            -DCMAKE_BUILD_TYPE=Release
            -DDEVICE_BACKEND=${BACKEND}
            -DDEVICE_ARCH=${GPU_MODEL}
            -DHOST_ARCH=${HOST}
            -DPRECISION=${precision}
            -DPROXY_PYBINDING=ON ;
            make -j 2 ;
            cd .. ;
          done;
        # Turn nounset back off for whatever runs after this script.
        - set +u
    artifacts:
        paths:
            - build_*


# Runs the elastic convergence test from the SeisSol Examples repository
# against the executables built by gpu_build, once per BACKEND and precision.
gpu_convergence_test:
    stage: test
    allow_failure: false
    needs:
        # A job with `needs` only downloads artifacts from the jobs listed
        # here. The shared steps read ./self_usr, which gpu_pre_build
        # publishes, so that job must be listed as well.
        - job: gpu_pre_build
        - job: gpu_build
    parallel:
        matrix:
        - BACKEND: [cuda, hipsycl, hip]
    script:
        - *common_gpu_steps
        - git clone https://github.com/SeisSol/Examples.git tests
        - pip3 install -r ./tests/convergence_elastic/requirements.txt
        - set -euo pipefail
        # The test files are copied into each build directory and the runner is
        # started from there.
        - for precision in double single; do
            cd ./build_${BACKEND}_${precision} ;
            echo "[${BACKEND}] Elastic Convergence test with precision - ${precision}" ;
            cp -r ../tests/convergence_elastic/* . ;
            PYTHONPATH=$PWD python3 ./elastic_convergence_runner
            --executable $PWD/SeisSol_Release_*
            --tmp-dir /tmp/seissol
            --sizes 4 8 16
            --expected-errors 1e-2 1e-4 5e-5
            --norm-type LInf
            --end-time 0.5
            --allow-run-as-root ;
            cd .. ;
          done;
        # Turn nounset back off for whatever runs after this script.
        - set +u
    # Re-publish the build directories; gpu_performance_test takes them from
    # this job.
    artifacts:
        paths:
            - build_*


# Runs the proxy performance scripts found in each build directory, once per
# BACKEND and precision. A failure here does not fail the pipeline.
gpu_performance_test:
    stage: performance_eval
    allow_failure: true
    needs:
        # A job with `needs` only downloads artifacts from the jobs listed
        # here. The shared steps read ./self_usr, which gpu_pre_build
        # publishes; the build_* directories come from gpu_convergence_test.
        - job: gpu_pre_build
        - job: gpu_convergence_test
    parallel:
        matrix:
        - BACKEND: [cuda, hipsycl, hip]
    script:
        - *common_gpu_steps
        - set -euo pipefail
        - for precision in double single; do
            echo "[${BACKEND}] Running SeiSol proxy with precision - ${precision}" ;
            cd ./build_${BACKEND}_${precision} ;
            pip3 install -r ./proxy-runners/requirements.txt ;
            python3 ./proxy-runners/run-all.py -c 25000 -t 100 ;
            cat ./proxy_*.pd ;
            cd .. ;
          done;
        # Turn nounset back off for whatever runs after this script.
        - set +u